In [1]:
import func_user_tweet_classifier as utc
import func_user_sentiment_analysis as usa

# Sentiment Analysis 

In [2]:
# Run the sentiment preprocessing helper and preview the first rows of its
# result.
# NOTE(review): output_file is the same path as labels_file, so this step
# presumably rewrites data/labels.csv in place -- confirm that re-running the
# cell on an already-processed file is safe.
df = usa.preprocess_data(
    labels_file='data/labels.csv',
    tweets_file='data/cleaned_tweets.csv',
    output_file='data/labels.csv',
)
display(df.head())

Unnamed: 0,user_id,label,subjectivity
0,8436472,b,1
1,1310040047421403136,t,1
2,210165301,t,1
3,1220517169961820160,t,-1
4,21906070,b,-1


# User Classification

### Merging Datasets

In [3]:
# Input paths; tweets_file is reused by the prediction cell further down.
labels_file, tweets_file = 'data/labels.csv', 'data/cleaned_tweets.csv'

# Join the labels with the tweets, then pass the 'label' column through
# utc.convert_labels (the displayed frame shows it as 1 / -1 afterwards).
merged_df = utc.convert_labels(
    utc.load_and_merge_data(labels_file, tweets_file),
    'label',
)
merged_df

Unnamed: 0,user_id,cleaned_tweets,label,subjectivity,all_tweets
0,8.436472e+06,"Trump: As a student I used to hear for years, ...",1,1,"Trump: As a student I used to hear for years, ..."
1,1.310040e+18,tedcruz cc125 Trump2020 BLM obama VoteRed...,-1,1,tedcruz cc125 Trump2020 BLM obama VoteRed...
2,2.101653e+08,DavidJHarrisJr DemsAreCorrupt Trump FourMor...,-1,1,DavidJHarrisJr DemsAreCorrupt Trump FourMor...
3,1.220517e+18,NorthCarolina ncpol Trump CBP USBP Stoppi...,-1,-1,NorthCarolina ncpol Trump CBP USBP Stoppi...
4,2.190607e+07,"‘Totally under control’: New, secretly- film...",1,-1,"‘Totally under control’: New, secretly- film..."
...,...,...,...,...,...
95,2.154027e+09,First executive order of the reelected preside...,-1,1,First executive order of the reelected preside...
96,1.599926e+07,GeoffRBennett NBCNightlyNews LesterHoltNBC ...,1,-1,GeoffRBennett NBCNightlyNews LesterHoltNBC ...
97,8.370226e+17,How a 2nd-Grade Class Sent a Science Experimen...,1,-1,How a 2nd-Grade Class Sent a Science Experimen...
98,1.169288e+18,BarackObama is hitting the campaign trail for ...,1,1,BarackObama is hitting the campaign trail for ...


### Train, Test, Evaluate Model

In [4]:
# 1. Hold out part of the merged data for evaluation.
X_train, X_test, Y_train, Y_test = utc.split_dataset(merged_df)

# 2. Turn the text into features. The fitted vectorizer is kept so the same
#    transformation can be applied to new data later.
X_train_tfidf, X_test_tfidf, trained_vectorizer = utc.vectorize_text(
    X_train, X_test
)

# 3. Reduce the feature dimensionality; the fitted reducer is kept as well.
X_train_reduced, X_test_reduced, trained_svd = utc.reduce_dimensions(
    X_train_tfidf, X_test_tfidf
)

# 4. Fit the model on the training split and predict the held-out split.
trained_model, Y_pred = utc.train_and_predict(
    X_train_reduced, Y_train, X_test_reduced
)

# 5. Report how the predictions compare with the held-out labels.
utc.evaluate_predictions(Y_test, Y_pred)

Label: label
              precision    recall  f1-score   support

          -1       1.00      0.21      0.35        19
           1       0.80      1.00      0.89        61

    accuracy                           0.81        80
   macro avg       0.90      0.61      0.62        80
weighted avg       0.85      0.81      0.76        80

Accuracy: 0.8125


### Prediction

In [5]:
# Score every user in tweets_file with the fitted vectorizer, reducer and
# model from the training cell.
predicted_new_data = utc.predict_new_data(
    tweets_file, trained_vectorizer, trained_svd, trained_model
)

# Keep only the columns needed downstream and map the numeric class labels to
# candidate names. Using .assign on the column selection builds a new frame in
# one expression; assigning a column on the result of df[[...]] instead is the
# pattern that triggers pandas' SettingWithCopyWarning.
predicted_new_data = predicted_new_data[
    ['user_id', 'cleaned_tweets', 'label', 'subjectivity']
].assign(
    label=lambda frame: frame['label'].replace({1: 'Biden', -1: 'Trump'})
)
display(predicted_new_data.head())

Unnamed: 0,user_id,cleaned_tweets,label,subjectivity
0,360666500.0,Elecciones2020 | En Florida: JoeBiden dice q...,Biden,1
1,8436472.0,"Trump: As a student I used to hear for years, ...",Biden,-1
2,8.283556e+17,2 hours since last tweet from Trump! Maybe he...,Trump,1
3,47413800.0,You get a tie! And you get a tie! Trump ‘s ra...,Trump,1
4,1138416000.0,CLady62 Her 15 minutes were over long time ago...,Trump,1


# Converting Predictions for Export

In [6]:
# Shape the predictions into the export schema:
# user_id, label, tweet, subjectivity, year.
# The whole transformation is one method chain, so every step returns a new
# frame and no column is ever assigned on a slice of another frame (which
# would raise SettingWithCopyWarning).
# NOTE(review): user_id is displayed as a float upstream, so IDs above 2**53
# may already have been rounded before this cell runs -- the loader should
# probably read user_id as an integer or string; confirm.
predict_df = (
    predicted_new_data[['user_id', 'label', 'cleaned_tweets', 'subjectivity']]
    .assign(year=2020)
    .rename(columns={'cleaned_tweets': 'tweet'})
    # Rows without a user_id cannot be cast to an integer; drop them first.
    .dropna(subset=['user_id'])
    # Explicit 64-bit: plain `int` can map to a 32-bit dtype on some
    # platforms, which is too narrow for 18-digit user IDs.
    .astype({'user_id': 'int64'})
)
predict_df.head()

Unnamed: 0,user_id,label,tweet,subjectivity,year
0,360666534,Biden,Elecciones2020 | En Florida: JoeBiden dice q...,1,2020
1,8436472,Biden,"Trump: As a student I used to hear for years, ...",-1,2020
2,828355589206056960,Trump,2 hours since last tweet from Trump! Maybe he...,1,2020
3,47413798,Trump,You get a tie! And you get a tie! Trump ‘s ra...,1,2020
4,1138416104,Trump,CLady62 Her 15 minutes were over long time ago...,1,2020


In [7]:
# Persist the export-ready predictions as a Parquet file.
# Destination path; an existing file at this location is replaced.
parquet_file = 'data/labels.parquet'
# With default arguments pandas also stores the frame's index in the file.
predict_df.to_parquet(parquet_file)