In [1]:
import numpy as np


np.random.seed(70103)

# Problem 1

## Part 2

In [2]:
from scipy.spatial.distance import euclidean


sentence = np.array([2, 1, 1])  # vector corresponding to "see eye to eye", words ordered alphabetically
query = np.array([1, 0, 0])     # vector corresponding to "eye" following the above order

# compute Euclidean distance
euclidean_distance = euclidean(query, sentence)
euclidean_distance

1.7320508075688772

## Part 3

In [3]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# define documents
D1 = "Sandwich is a common lunch in the UK."
D2 = "In Denmark, workers have lunch at midday."

# vectorize the documents with a custom token pattern
vectorizer = CountVectorizer(token_pattern=r"(?u)\b\w+\b")
vectors = vectorizer.fit_transform([D1, D2])

pd.DataFrame(vectors.toarray(), columns=vectorizer.get_feature_names_out())

# compute cosine similarity
cosine_sim = cosine_similarity(vectors[0], vectors[1])[0][0]
cosine_sim

0.26726124191242434

In [4]:
pd.DataFrame(vectors.toarray(), columns=vectorizer.get_feature_names_out(), index=["D1", "D2"])

Unnamed: 0,a,at,common,denmark,have,in,is,lunch,midday,sandwich,the,uk,workers
D1,1,0,1,0,0,1,1,1,0,1,1,1,0
D2,0,1,0,1,1,1,0,1,1,0,0,0,1


## Part 4

In [5]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b")
# compute the TF-IDF matrix
tfidf_matrix = tfidf_vectorizer.fit_transform([D1, D2])

# convert the matrix to a dense format and get feature names
tfidf_dense = tfidf_matrix.todense()
feature_names = tfidf_vectorizer.get_feature_names_out()
tfidf_df = pd.DataFrame(tfidf_dense, columns=feature_names, index=["D1", "D2"])
pd.set_option('display.float_format', lambda x: '%.3f' % x)
tfidf_df

Unnamed: 0,a,at,common,denmark,have,in,is,lunch,midday,sandwich,the,uk,workers
D1,0.378,0.0,0.378,0.0,0.0,0.269,0.378,0.269,0.0,0.378,0.378,0.378,0.0
D2,0.0,0.408,0.0,0.408,0.408,0.29,0.0,0.29,0.408,0.0,0.0,0.0,0.408


# Problem 2

In [6]:
import re

import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline

In [7]:
# load data, assuming the file RAW_recipes.csv is in the same directory as the notebook
file_path = 'RAW_recipes.csv'
data = pd.read_csv(file_path)
descriptions_raw = data[data["description"].notna()][:1000][["description"]]  # get the first not NA 1000 descriptions
descriptions_raw

Unnamed: 0,description
0,autumn is my favorite time of year to cook! th...
1,this recipe calls for the crust to be prebaked...
2,this modified version of 'mom's' chili was a h...
3,"this is a super easy, great tasting, make ahea..."
4,my dh's amish mother raised him on this recipe...
...,...
1006,classic from florida's famous columbia restaur...
1007,this is from cajun-recipes.com.
1008,"perfect for a warm summer day, this is from a ..."
1009,i found this recipe in a cookbook of old ameri...


In [8]:
# download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')

def preprocess_text(text):
    # convert to lowercase
    text = text.lower()
    # remove numbers and symbols
    text = re.sub(r'[\d\W]+', ' ', text)
    # remove stop words
    stop_words = set(stopwords.words('english'))
    words = text.split()
    words = [word for word in words if word not in stop_words]
    # perform stemming
    stemmer = PorterStemmer()
    words = [stemmer.stem(word) for word in words]

    return ' '.join(words)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/joanalevtcheva/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/joanalevtcheva/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
descriptions = descriptions_raw['description'].apply(preprocess_text)
descriptions

0       autumn favorit time year cook recip prepar eit...
1       recip call crust prebak bit ad ingredi feel fr...
2       modifi version mom chili hit christma parti ma...
3       super easi great tast make ahead side dish loo...
4       dh amish mother rais recip much prefer store b...
                              ...                        
1006    classic florida famou columbia restaur whole r...
1007                                      cajun recip com
1008    perfect warm summer day forgotten recip cookbo...
1009    found recip cookbook old american magazin reci...
1010    orang extract option make ice complement flavo...
Name: description, Length: 1000, dtype: object

In [10]:
# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(sublinear_tf=True, stop_words="english")
# SVD with top 100 singulare values
svd = TruncatedSVD(100)
lsa = make_pipeline(tfidf_vectorizer, svd)

# fit LSA
X = lsa.fit_transform(descriptions)
X.shape

(1000, 100)

In [11]:
# top terms for each component
terms = tfidf_vectorizer.get_feature_names_out()
for i, comp in enumerate(svd.components_):
    terms_comp = zip(terms, comp)
    sorted_terms = sorted(terms_comp, key=lambda x: x[1], reverse=True)[:10]
    print("Topic " + str(i) + ": " + ", ".join([t[0] for t in sorted_terms]))

print("Explained Variance Ratio:", np.sum(svd.explained_variance_ratio_))

Topic 0: recip, make, easi, use, like, time, good, love, tast, great
Topic 1: easi, quick, dinner, super, make, bread, dish, kraft, great, good
Topic 2: good, cake, love, chocol, recip, got, quick, everyon, simpl, tri
Topic 3: good, soup, serv, use, chicken, low, fat, cream, meat, bean
Topic 4: delici, tri, look, kraft, dish, magazin, tasti, time, cook, good
Topic 5: cake, bread, tri, time, chocol, delici, includ, cooki, flour, wheat
Topic 6: delici, cake, like, tri, love, dish, look, make, tast, nice
Topic 7: bread, soup, tri, delici, recip, love, look, bean, wheat, wonder
Topic 8: great, time, tast, kid, magazin, love, look, tri, day, good
Topic 9: good, year, famili, time, make, delici, includ, favorit, simpl, bread
Topic 10: dish, serv, quick, cake, bread, meal, favorit, fat, famili, delici
Topic 11: tast, wonder, serv, like, dessert, salad, easi, food, famili, best
Topic 12: tri, salad, dinner, cake, great, sauc, serv, food, dress, use
Topic 13: quick, fat, tast, delici, love, low

In [12]:
# from sklearn.manifold import TSNE
# import matplotlib.pyplot as plt

# # Reducing the dimensionality of the data to 2 dimensions using t-SNE for visualization
# tsne = TSNE(n_components=2, random_state=0, perplexity=15)
# X_tsne = tsne.fit_transform(X)

# plt.figure(figsize=(8, 8))
# plt.scatter(X_tsne[:, 0], X_tsne[:, 1], marker='.')
# plt.title('t-SNE visualization of LSA topics')
# plt.xlabel('t-SNE feature 1')
# plt.ylabel('t-SNE feature 2')

# plt.show()

In [13]:
from sklearn.metrics.pairwise import cosine_similarity


test_descr = data[-56:-55]["description"].values
# test_descr = data[-66:-65]["description"].values
# test_descr = data[-1:]["description"].values
print(test_descr[0])
res = lsa.transform(test_descr)
cos_sim = cosine_similarity(X, res)

argmax_cs = np.argmax(cos_sim)
print(argmax_cs)
print(cos_sim[argmax_cs][0])
print(descriptions_raw["description"][argmax_cs])

# 613
# 0.606620611753556

good healthy low fat soup.
170
0.8051080803273803
a super, veggie-packed salad.  courtesy of rachael ray.


# Problem 3

In [14]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import re
import nltk
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [15]:
# downloading NLTK resources
nltk.download('punkt')
nltk.download('stopwords')

def preprocess_text(text):
    # convert to lowercase
    text = text.lower()
    # remove numbers and symbols
    text = re.sub(r'[\d\W]+', ' ', text)
    # remove stop words
    stop_words = set(stopwords.words('english'))
    words = text.split()
    words = [word for word in words if word not in stop_words]
    # perform stemming
    stemmer = PorterStemmer()
    words = [stemmer.stem(word) for word in words]

    return ' '.join(words)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/joanalevtcheva/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/joanalevtcheva/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Data Loading and Pre-Processing

In [16]:
# Load the provided dataset
file_path = 'reviews_data.csv'
data = pd.read_csv(file_path)
data = data[(data["Rating"].notna()) & (data["Review"] != "No Review Text")]
data['processed_review'] = data['Review'].apply(preprocess_text)
# consider ratings 4 and 5 as positive (1) and the rest as negative (0)
data['sentiment'] = data['Rating'].apply(lambda x: 1 if x > 3 else 0)
data


Unnamed: 0,name,location,Date,Rating,Review,Image_Links,processed_review,sentiment
0,Helen,"Wichita Falls, TX","Reviewed Sept. 13, 2023",5.000,Amber and LaDonna at the Starbucks on Southwes...,['No Images'],amber ladonna starbuck southwest parkway alway...,1
1,Courtney,"Apopka, FL","Reviewed July 16, 2023",5.000,** at the Starbucks by the fire station on 436...,['No Images'],starbuck fire station altamont spring fl made ...,1
2,Daynelle,"Cranberry Twp, PA","Reviewed July 5, 2023",5.000,I just wanted to go out of my way to recognize...,['https://media.consumeraffairs.com/files/cach...,want go way recogn starbuck employe billi fran...,1
3,Taylor,"Seattle, WA","Reviewed May 26, 2023",5.000,Me and my friend were at Starbucks and my card...,['No Images'],friend starbuck card work thank worker paid dr...,1
4,Tenessa,"Gresham, OR","Reviewed Jan. 22, 2023",5.000,I’m on this kick of drinking 5 cups of warm wa...,['https://media.consumeraffairs.com/files/cach...,kick drink cup warm water work instacart right...,1
...,...,...,...,...,...,...,...,...
700,Margaret,"Cotati, CA","Reviewed Oct. 2, 2011",1.000,I ordered Via Starbucks coffee online. I recei...,['No Images'],order via starbuck coffe onlin receiv email st...,0
701,Ric,"Oakville, ON","Reviewed Aug. 31, 2011",3.000,"My name is Ric **, I am journalist by professi...",['No Images'],name ric journalist profess send letter starbu...,0
702,Jayne,"Ny, NY","Reviewed Aug. 24, 2011",1.000,"The bagel was ice cold, not cut and not toasted.",['No Images'],bagel ice cold cut toast,0
703,Norma,"La Puente, CA","Reviewed Aug. 15, 2011",1.000,"In the morning of Monday, August 15, 2011, at ...",['No Images'],morn monday august co worker stop starbuck buy...,0


### Modelling

In [17]:
# extracting features and labels
reviews = data['processed_review']
sentiments = data['sentiment']

# apply TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=100)
features = tfidf_vectorizer.fit_transform(reviews)

# splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, sentiments, test_size=0.3, random_state=42)

# train Logistic Regression
log_reg_model = LogisticRegression()
log_reg_model.fit(X_train, y_train)

# predicting and evaluating the model
y_pred = log_reg_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

In [18]:
accuracy

0.8293838862559242

In [19]:
test_texts = ["My experience at Starbucks is great", "The Americano tastes awful"]
# transform the test data using the same TF-IDF vectorizer used for training the logistic regression model
test_features_lr = tfidf_vectorizer.transform(test_texts)

# predict sentiment
predictions_lr = log_reg_model.predict(test_features_lr)

# map numerical predictions back to sentiment labels
predicted_sentiments_lr = ["positive" if prediction == 1 else "negative" for prediction in predictions_lr]
predicted_sentiments_lr

['positive', 'negative']