In [108]:
from nltk.corpus import twitter_samples # importing samples
import numpy as np

import sentiment_analysis as sentiment_analysis
import pandas as pd
import os

# BUCKET_URL="https://d4gc2024.s3.eu-central-1.amazonaws.com/data"
BASE_URL = "../data"
filenames=[
    "foodsavingleuven/places.csv",
    "foodsavingleuven/members.csv"	,
    "foodsavingleuven/activities.csv",
    "foodsavingleuven/activities_participants.csv",
    "foodsavingleuven/feedback.csv"
]

In [109]:
positive_sample = twitter_samples.strings('positive_tweets.json')
negative_sample = twitter_samples.strings('negative_tweets.json')

# How many samples do we have ?
# print(len(positive_sample)) # 5000
# print(len(negative_sample)) # 5000


# 1) Training and Testing Arrays setup
train_x = positive_sample[:4000] + negative_sample[:4000]
test_x = positive_sample[4000:] + negative_sample[4000:]

# Combine positive and negative labels
train_y = np.append(np.ones((len(positive_sample[:4000]), 1)), np.zeros((len(negative_sample[:4000]), 1)), axis=0)
test_y = np.append(np.ones((len(positive_sample[4000:]), 1)), np.zeros((len(negative_sample[4000:]), 1)), axis=0)


# 2) Train model using the training dataset and test it on the test dataset
freqs = sentiment_analysis.build_freqs(positive_sample, negative_sample)
X = np.zeros((len(train_x), 3))
for i in range(len(train_x)):
    X[i, :] = sentiment_analysis.features_extraction(train_x[i], freqs)
# print(X)
"""
[[1.000e+00 3.764e+03 7.200e+01]
 [1.000e+00 4.464e+03 5.170e+02]
 [1.000e+00 3.759e+03 1.600e+02]
 ...
 [1.000e+00 1.840e+02 9.890e+02]
 [1.000e+00 2.560e+02 4.855e+03]
 [1.000e+00 2.400e+02 4.967e+03]]
"""

theta = sentiment_analysis.learn(X, train_y, 1e-9, 1000)
# print(theta)
"""
[[ 4.80560248e-08]
 [ 4.29616097e-04]
 [-4.54043371e-04]]
"""


# 3) Compute accuracy
# - y_hat    : y predicted
# - accuracy : accuracy value [0,1]
y_hat, accuracy = sentiment_analysis.compute_accuracy(test_x, test_y, freqs, theta)
# print(f"Accuracy : {accuracy}") # Accuracy : 0.997


# 4) Other custom tests
for str in ["Today is beautiful day!! :)",          # prediction = 1 (positive)
            "I am hopeless for the next week :(",   # prediction = 0 (negative)
            "That decision was a shame!"]:          # prediction = 0 (negative)
    print(f"{str} - {sentiment_analysis.predict(str, freqs, theta)}")


# 5) Confusion matrix plot
data = {
    'y_actual'    : test_y.reshape(len(y_hat)),
    'y_predicted' : y_hat
}
df = pd.DataFrame(data, columns=['y_actual', 'y_predicted'])
# confusion_matrix = pd.crosstab(df['y_actual'], df['y_predicted'], rownames=['Actual'], colnames=['Predicted'])
# sns.heatmap(confusion_matrix, annot=True)
# plt.show()

Today is beautiful day!! :) - 1
I am hopeless for the next week :( - 0
That decision was a shame! - 0


In [110]:
places_df=pd.read_csv(os.path.join(BASE_URL, filenames[0]),index_col=0)
places_df=places_df[["name","placeid"]]

activities_df=pd.read_csv(os.path.join(BASE_URL,filenames[2]),index_col=0)
activities_df=activities_df[["activity_id","placeid"]]

In [111]:
feedback_df=pd.read_csv(os.path.join(BASE_URL,filenames[4]),index_col=0)
feedback_df=feedback_df[["activity_id","feedback"]]

LOWER_BOUND = "0"
UPPER_BOUND = "1"
NEUTRAL_VAL = (int(UPPER_BOUND) + int(LOWER_BOUND)) / 2
feedback_df["feedback"] = feedback_df["feedback"].fillna("")
feedback_df["feedback"] = feedback_df["feedback"].apply(lambda x: "" if x == "" else UPPER_BOUND if sentiment_analysis.predict(x, freqs, theta) == 1 else LOWER_BOUND)
feedback_df["feedback"] = feedback_df["feedback"].replace("",NEUTRAL_VAL)
print(feedback_df.info())
print(feedback_df['feedback'].value_counts())

<class 'pandas.core.frame.DataFrame'>
Index: 1595 entries, 0 to 1594
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   activity_id  1595 non-null   int64 
 1   feedback     1595 non-null   object
dtypes: int64(1), object(1)
memory usage: 37.4+ KB
None
feedback
1      697
0.5    488
0      410
Name: count, dtype: int64


In [112]:
activities_places = (activities_df.merge(places_df, on="placeid", how="left"))
activities_feedback = (activities_places.merge(feedback_df, on="activity_id", how="left"))

# group by placeid and sum the feedback
activities_feedback["feedback"] = activities_feedback["feedback"].astype(int)
activities_feedback = activities_feedback.drop(columns=["activity_id"])

# Group by placeid and calculate the average feedback
activities_feedback_avg = activities_feedback.groupby("placeid")["feedback"].mean()
print(activities_feedback_avg.sort_values(ascending=False).head(20))
print(activities_feedback_avg.info())

placeid
2188    1.000000
2759    0.600000
2642    0.600000
854     0.550877
2766    0.533333
2593    0.500000
858     0.493333
2210    0.465909
1480    0.407407
2091    0.400000
885     0.400000
1178    0.384615
760     0.379121
761     0.361111
1417    0.350000
871     0.321429
876     0.306122
843     0.183673
1947    0.142857
950     0.125000
Name: feedback, dtype: float64
<class 'pandas.core.series.Series'>
Index: 20 entries, 760 to 2766
Series name: feedback
Non-Null Count  Dtype  
--------------  -----  
20 non-null     float64
dtypes: float64(1)
memory usage: 320.0 bytes
None
