<a href="https://colab.research.google.com/github/LewNeko/Data-Mining-Group-Project/blob/main/TB_4_26.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler

In [2]:
df = pd.read_csv("Recipe Reviews and User Feedback Dataset.csv")
df = df[df['stars'] > 0]  # Remove rows with 0-star ratings
df[['text', 'stars', 'thumbs_up', 'thumbs_down']].head()

Unnamed: 0,text,stars,thumbs_up,thumbs_down
0,"I tweaked it a little, removed onions because ...",5,0,0
1,Bush used to have a white chili bean and it ma...,5,7,0
2,I have a very complicated white chicken chili ...,5,3,0
5,amazing! my boyfriend loved it so much! going ...,5,3,1
6,Wow!!! This recipe is excellent as written!! ...,5,11,0


In [3]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [4]:
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

In [5]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'[^a-z\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    tokens = nltk.word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]
    tokens = [stemmer.stem(word) for word in tokens]
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)

In [6]:
df_clean = df[['text', 'stars', 'thumbs_up', 'thumbs_down']].dropna()
df_clean = df_clean[df_clean['text'].str.strip() != '']
df_clean['processed_text'] = df_clean['text'].apply(preprocess_text)
df_clean[['text', 'processed_text', 'stars']].head()

Unnamed: 0,text,processed_text,stars
0,"I tweaked it a little, removed onions because ...",tweak littl remov onion onion hater hous use i...,5
1,Bush used to have a white chili bean and it ma...,bush use white chili bean made recip super sim...,5
2,I have a very complicated white chicken chili ...,complic white chicken chili recip made year ev...,5
5,amazing! my boyfriend loved it so much! going ...,amaz boyfriend love much go make week,5
6,Wow!!! This recipe is excellent as written!! ...,wow recip excel written chang made use oz jar ...,5


In [7]:
vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
tfidf_matrix = vectorizer.fit_transform(df_clean['processed_text'])
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())
tfidf_df.head()

Unnamed: 0,abl,absolut,accord,actual,ad,adapt,add,addit,adjust,admit,...,yeast,yellow,yesterday,yield,yogurt,youll,yr,yum,yummi,zucchini
0,0.0,0.151083,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
scaler = MinMaxScaler()
scaled_df = pd.DataFrame(scaler.fit_transform(df_clean[['thumbs_up', 'thumbs_down']]), columns=['thumbs_up_norm', 'thumbs_down_norm'])
scaled_df.head()

Unnamed: 0,thumbs_up_norm,thumbs_down_norm
0,0.0,0.0
1,0.066038,0.0
2,0.028302,0.0
3,0.028302,0.007937
4,0.103774,0.0


In [9]:
final_df = pd.concat([df_clean[['stars']], scaled_df, tfidf_df], axis=1)
final_df.to_excel("final_preprocessed_dataset.xlsx", index=False)
final_df.head()

Unnamed: 0,stars,thumbs_up_norm,thumbs_down_norm,abl,absolut,accord,actual,ad,adapt,add,...,yeast,yellow,yesterday,yield,yogurt,youll,yr,yum,yummi,zucchini
0,5.0,0.0,0.0,0.0,0.151083,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,5.0,0.066038,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,5.0,0.028302,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,5.0,0.056604,0.0,0.0,0.133056,0.0,0.0,0.0,0.0,0.107099,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
import os
import json
import csv
import collections
from scipy.stats import zscore
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

df_clean1 = final_df.copy()
df_clean1 = df_clean1.dropna()
final_input_features = df_clean1.drop(['stars'], axis=1)
x_train1, x_test1, y_train1, y_test1 = train_test_split(final_input_features, df_clean1['stars'], test_size=0.2, random_state=42)
#from lab 4_2
#train_data, test_data, train_labels, test_labels = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [13]:
# Scaling the features to be between 0 and 1.
scaler = MinMaxScaler()
train_data = scaler.fit_transform(x_train1)
test_data = scaler.transform(x_test1)

In [21]:
# Define the layers in the network********************************************************************LAYERS
# Import the layers and the model
from tensorflow import keras
from keras.layers import Input
from keras.layers import Dense
from keras.models import Model
inputs = Input(shape=(1002,))
dense1 = Dense(units=1002, activation='relu')(inputs)
#dense2 = Dense(units=9, activation='relu')(dense1)
#dense3 = Dense(units=9, activation='relu')(dense2)
outputs = Dense(units=1, activation='sigmoid')(dense1)

In [22]:
# Define the model by providing the inputs and outputs
model_1 = Model(inputs, outputs)

# Model summary
model_1.summary()
opt = keras.optimizers.Adam(learning_rate=0.03) #*********************************************************LR
model_1.compile(optimizer=opt, loss='binary_crossentropy', metrics=['accuracy'])

history = model_1.fit(x_train1, y_train1, epochs=100, batch_size=64, verbose=0)


In [23]:
# Evaluate on test dataset
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

evals_test = model_1.evaluate(test_data, y_test1)
#print("Classification Accuracy: ", evals_test[1])

# evals_test contains the loss and accuracy of the model
#print(evals_test)

predictions = model_1.predict(test_data)
#print(predictions.shape)
#print(np.around(predictions[:6], 3))

#predictions = model_1.predict(test_data)

accuracy = accuracy_score(y_test1, np.round(predictions))
#print('The test accuracy is {0:6.4f} %'.format(accuracy*100))

# Getting the confusion matrix
confmat = confusion_matrix(y_test1, np.round(predictions), labels=[1,0])
print("Confusion Matrix:")
print(confmat)

print(classification_report(y_test1, np.round(predictions)))

[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.0169 - loss: -3995643392.0000
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
Confusion Matrix:
[[58  0]
 [ 0  0]]
              precision    recall  f1-score   support

         1.0       0.02      1.00      0.04        58
         2.0       0.00      0.00      0.00        40
         3.0       0.00      0.00      0.00        88
         4.0       0.00      0.00      0.00       290
         5.0       0.00      0.00      0.00      2515

    accuracy                           0.02      2991
   macro avg       0.00      0.20      0.01      2991
weighted avg       0.00      0.02      0.00      2991



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
