### Name: Aditya Taori and Gurneet Chhabra
### Course Name: Introduction to Statistical Machine Learning
### Course Code: DSCC465
### Assignment Name: Kaggle Assignment

### Importing Libraries

In [1]:
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
import re
import random
from wordcloud import WordCloud
from nltk.corpus import stopwords
import string
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.decomposition import PCA,TruncatedSVD
from sklearn.naive_bayes import MultinomialNB
from nltk.stem import WordNetLemmatizer
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing

### Setting up input directories

In [2]:
input_dir = "E:/UOR Notes/Stats for Machine Learning/Assignments/Kaggle Assignment/"
training_input_file = "congressional_tweet_training_data.csv"
testing_input_file = "congressional_tweet_test_data.csv"
training_input_path = input_dir+training_input_file
testing_input_path = input_dir + testing_input_file
random.seed(265)  #Setting random seed

### Reading CSV files Tweets Data

In [3]:
cong_tweets_data = pd.read_csv(training_input_path)
cong_tweets_data.head()

Unnamed: 0,favorite_count,full_text,hashtags,retweet_count,year,party_id
0,0,"b""RT @KUSINews: One of our longtime viewers wa...",KUSI,10,2017.0,R
1,258,"b""Today I'm urging the @CDCgov to immediately ...",Coronavirus,111,2020.0,R
2,0,"b'Tomorrow, #MO03 seniors graduate from Calvar...",MO03,2,2014.0,R
3,9,b'Congrats to #TeamUSA and Canton Native @JGre...,TeamUSA WorldJuniors,3,2017.0,R
4,3,b'Pleased to support @amergateways at their Ju...,ImmigrantHeritageMonth,3,2019.0,D


In [4]:
cong_tweets_data.isna().sum()

favorite_count        0
full_text             0
hashtags              0
retweet_count         0
year              18712
party_id              0
dtype: int64

In [None]:
cong_tweets_data = cong_tweets_data.dropna(subset = ["Cleaned_Text_WO_Stopwords"])

In [5]:
cong_testing_data = pd.read_csv(testing_input_path)
cong_testing_data.head()

Unnamed: 0,Id,favorite_count,full_text,hashtags,retweet_count,year,party
0,0,70,b'#TaxReform improved the playing field for Am...,TaxReform,13,2018.0,D
1,1,27,"b'This #NativeWomensEqualPay Day, we recommit ...",NativeWomensEqualPay,11,,D
2,2,49,"b""\xe2\x80\x9cI became convinced that our gene...",MeToo ShatteringTheSilence,24,2017.0,D
3,3,14,"b'During #NationalAdoptionMonth, we honor the ...",NationalAdoptionMonth,2,2019.0,D
4,4,13,b'Happy #AirborneDay to our @USArmy paratroope...,AirborneDay AirborneAllTheWay,7,2018.0,D


In [6]:
cong_testing_data.isna().sum()

Id                   0
favorite_count       0
full_text            0
hashtags             0
retweet_count        0
year              8347
party                0
dtype: int64

In [7]:
def clean_tweets(tweets_text,sw_rem):
    lemmatizer = WordNetLemmatizer()
    #print(tweets_text)
    #print("\n")
    tweets_text = re.sub("b['\"]","",tweets_text)
    tweets_text = re.sub('http\S+',"",tweets_text)
    #tweets_text = re.sub('#[A-Za-z0-9]*',"",tweets_text)
    tweets_text = re.sub('@[A-Za-z0-9]*',"",tweets_text)
    tweets_text = re.sub(r'[\\]n'," ",tweets_text)
    tweets_text = re.sub(r'[\\]x[a-f0-9][a-f0-9]',"",tweets_text)
    tweets_text = re.sub(r'(&amp;)|RT ',"",tweets_text)
    if sw_rem==1:
        cachedStopWords = stopwords.words("english")
        regexes= r'\b(' + r'|'.join(cachedStopWords) + r')\b\s*'
        tweets_text = re.sub(regexes,"",tweets_text.lower())
    new_string = tweets_text.translate(str.maketrans('', '', string.punctuation))
    tokenized_string = new_string.split(" ")
    without_digit = [i for i in tokenized_string if not i.isdigit()]
    no_small_words_text = [i for i in without_digit if len(i)>2]  #Removing short characters 
    lemmatized_string_list = [lemmatizer.lemmatize(word) for word in no_small_words_text if word!=""]
    lemmatized_string = " ".join(lemmatized_string_list)
    #print(new_string)
    #print("\n")
    return lemmatized_string

In [8]:
cong_tweets_data["Cleaned_Text"] = cong_tweets_data.apply(lambda x:clean_tweets(x['full_text'],0),axis=1)
cong_tweets_data

Unnamed: 0,favorite_count,full_text,hashtags,retweet_count,year,party_id,Cleaned_Text
0,0,"b""RT @KUSINews: One of our longtime viewers wa...",KUSI,10,2017.0,R,One our longtime viewer wa Congressman office ...
1,258,"b""Today I'm urging the @CDCgov to immediately ...",Coronavirus,111,2020.0,R,Today urging the immediately launch phone hotl...
2,0,"b'Tomorrow, #MO03 seniors graduate from Calvar...",MO03,2,2014.0,R,Tomorrow MO03 senior graduate from Calvary Lut...
3,9,b'Congrats to #TeamUSA and Canton Native @JGre...,TeamUSA WorldJuniors,3,2017.0,R,Congrats TeamUSA and Canton Native winning the...
4,3,b'Pleased to support @amergateways at their Ju...,ImmigrantHeritageMonth,3,2019.0,D,Pleased support their June Fiesta which honore...
...,...,...,...,...,...,...,...
592798,3,"b'This time, it focused on careers in #publics...",publicservice publicsafety,0,2017.0,R,This time focused career publicservice and pub...
592799,5,"b'.#StormyDaniels, #MichaelWolfe, #JamesComey ...",StormyDaniels MichaelWolfe JamesComey,1,2018.0,R,StormyDaniels MichaelWolfe JamesComey Making t...
592800,33,b'@NRDems The American people deserve the trut...,CultureOfCorruption,14,2020.0,D,The American people deserve the truth and Cong...
592801,4,b'Only 2 weeks left to submit your #app to the...,app copolitics CAC16 HouseOfCode co06,3,2016.0,R,Only week left submit your app the Congression...


In [None]:
cong_tweets_data["Cleaned_Text_WO_Stopwords"].iloc[1871]

In [None]:
cong_tweets_data["Cleaned_Text_WO_Stopwords"] = cong_tweets_data.apply(lambda x:clean_tweets(x['full_text'],1),axis=1)
cong_tweets_data

In [None]:
len_clean_tweets = cong_tweets_data["Cleaned_Text"].apply(len)
cong_tweets_data["Cleaned_Tweets_Length"] = len_clean_tweets

In [None]:
len_clean_tweets_wo_sw = cong_tweets_data["Cleaned_Text_WO_Stopwords"].apply(len)
cong_tweets_data["Cleaned_Tweets_WO_Stopwords_Length"] = len_clean_tweets_wo_sw

In [None]:
cong_tweets_data.to_csv("Processed_Training_Data.csv")

In [9]:
label_encoder = preprocessing.LabelEncoder()
 
# Encode labels in column 'species'.
cong_tweets_data['encoded_party']= label_encoder.fit_transform(cong_tweets_data["party_id"])


In [10]:
cong_tweets_data['encoded_party']
label_encoder.classes_

array(['D', 'R'], dtype=object)

In [11]:
#X_train, X_test, y_train, y_test = train_test_split(cong_tweets_data["Cleaned_Text_WO_Stopwords"],cong_tweets_data["encoded_party"],test_size=0.2,shuffle=True)
X_train, X_test, y_train, y_test = train_test_split(cong_tweets_data["Cleaned_Text"],cong_tweets_data["encoded_party"],test_size=0.2,shuffle=True)

In [12]:
tfidf_vectorizer1 = TfidfVectorizer(lowercase=True, ngram_range=(1, 1)) #object for ngram_range(1,1)
X_train_vectors_tfidf = tfidf_vectorizer1.fit_transform(X_train) 
X_test_vectors_tfidf = tfidf_vectorizer1.transform(X_test)
print(X_train_vectors_tfidf.shape)
print(X_test_vectors_tfidf.shape)

(474242, 165538)
(118561, 165538)


In [13]:
nb_clf = MultinomialNB()
tfidf_vect_nb_model = nb_clf.fit(X_train_vectors_tfidf,y_train)
tfidf_vect_nb_predictions = tfidf_vect_nb_model.predict(X_test_vectors_tfidf)
tfidf_vect1_accuracy = accuracy_score(y_test,tfidf_vect_nb_predictions)
#y_prob = tfidf_vect_nb_model.predict_proba(X_test_vectors_tfidf)[:,1]
print(classification_report(y_test,tfidf_vect_nb_predictions))
print('Confusion Matrix:',confusion_matrix(y_test, tfidf_vect_nb_predictions))

              precision    recall  f1-score   support

           0       0.84      0.89      0.86     65025
           1       0.85      0.80      0.82     53536

    accuracy                           0.85    118561
   macro avg       0.85      0.84      0.84    118561
weighted avg       0.85      0.85      0.85    118561

Confusion Matrix: [[57724  7301]
 [10964 42572]]


In [20]:
lr_clf = LogisticRegression(solver = 'liblinear', C=20, penalty = 'l2',class_weight='balanced')
tfidf_vect_lr_model = lr_clf.fit(X_train_vectors_tfidf,y_train)
tfidf_vect_lr_predictions = tfidf_vect_lr_model.predict(X_test_vectors_tfidf)
tfidf_vect_lr_accuracy = accuracy_score(y_test,tfidf_vect_lr_predictions)
y_prob = tfidf_vect_lr_model.predict_proba(X_test_vectors_tfidf)[:,1]
print(classification_report(y_test,tfidf_vect_lr_predictions))
print('Confusion Matrix:',confusion_matrix(y_test, tfidf_vect_lr_predictions))


              precision    recall  f1-score   support

           0       0.89      0.87      0.88     65025
           1       0.85      0.87      0.86     53536

    accuracy                           0.87    118561
   macro avg       0.87      0.87      0.87    118561
weighted avg       0.87      0.87      0.87    118561

Confusion Matrix: [[56520  8505]
 [ 6708 46828]]


In [None]:
svm_clf = SVC()
tfidf_vect_svm_model = svm_clf.fit(X_train_vectors_tfidf,y_train)
tfidf_vect_svm_predictions = tfidf_vect_svm_model.predict(X_test_vectors_tfidf)
tfidf_vect_svm_accuracy = accuracy_score(y_test,tfidf_vect_svm_predictions)
y_prob = tfidf_vect_svm_model.predict_proba(X_test_vectors_tfidf)[:,1]
print(classification_report(y_test,tfidf_vect_svm_predictions))
print('Confusion Matrix:',confusion_matrix(y_test, tfidf_vect_svm_predictions))

In [None]:
rf_clf = RandomForestClassifier(n_estimators = 50, random_state = 42,max_depth=50,class_weight = "balanced")
tfidf_vect_rf_model = rf_clf.fit(X_train_vectors_tfidf,y_train)
tfidf_vect_rf_predictions = tfidf_vect_rf_model.predict(X_test_vectors_tfidf)
tfidf_vect_rf_accuracy = accuracy_score(y_test,tfidf_vect_rf_predictions)
y_prob = tfidf_vect_rf_model.predict_proba(X_test_vectors_tfidf)[:,1]
print(classification_report(y_test,tfidf_vect_rf_predictions))
print('Confusion Matrix:',confusion_matrix(y_test, tfidf_vect_rf_predictions))

In [15]:
tfidf_vectorizer12 = TfidfVectorizer(lowercase=True, ngram_range=(1, 2)) #object for ngram_range(1,1)
X_train_vectors_tfidf_12 = tfidf_vectorizer12.fit_transform(X_train) 
X_test_vectors_tfidf_12 = tfidf_vectorizer12.transform(X_test)
print(X_train_vectors_tfidf_12.shape)
print(X_test_vectors_tfidf_12.shape)

(474242, 2363402)
(118561, 2363402)


In [16]:
nb_clf = MultinomialNB()
tfidf12_vect_nb_model = nb_clf.fit(X_train_vectors_tfidf_12,y_train)
tfidf12_vect_nb_predictions = tfidf12_vect_nb_model.predict(X_test_vectors_tfidf_12)
tfidf_vect1_accuracy = accuracy_score(y_test,tfidf12_vect_nb_predictions)
#y_prob = tfidf_vect_nb_model.predict_proba(X_test_vectors_tfidf)[:,1]
print(classification_report(y_test,tfidf12_vect_nb_predictions))
print('Confusion Matrix:',confusion_matrix(y_test, tfidf12_vect_nb_predictions))

              precision    recall  f1-score   support

           0       0.78      0.93      0.85     65025
           1       0.89      0.68      0.77     53536

    accuracy                           0.82    118561
   macro avg       0.83      0.80      0.81    118561
weighted avg       0.83      0.82      0.81    118561

Confusion Matrix: [[60327  4698]
 [17099 36437]]


In [17]:
lr_clf_12 = LogisticRegression(solver = 'liblinear', C=10, penalty = 'l2')
tfidf12_vect_lr_model = lr_clf_12.fit(X_train_vectors_tfidf_12,y_train)
tfidf12_vect_lr_predictions = tfidf12_vect_lr_model.predict(X_test_vectors_tfidf_12)
#tfidf_vect_lr_accuracy = accuracy_score(y_test,tfidf_vect_lr_predictions)
#y_prob_12 = tfidf12_vect_lr_model.predict_proba(X_test_vectors_tfidf_12)[:,1]
print(classification_report(y_test,tfidf12_vect_lr_predictions))
print('Confusion Matrix:',confusion_matrix(y_test, tfidf12_vect_lr_predictions))

              precision    recall  f1-score   support

           0       0.88      0.89      0.88     65025
           1       0.86      0.85      0.86     53536

    accuracy                           0.87    118561
   macro avg       0.87      0.87      0.87    118561
weighted avg       0.87      0.87      0.87    118561

Confusion Matrix: [[57768  7257]
 [ 8105 45431]]


In [None]:
rf_clf_12 = RandomForestClassifier(n_estimators = 10, random_state = 42,class_weight = "balanced")
tfidf12_vect_rf_model = rf_clf_12.fit(X_train_vectors_tfidf_12,y_train)
tfidf12_vect_rf_predictions = tfidf12_vect_rf_model.predict(X_test_vectors_tfidf_12)
#tfidf12_vect_rf_accuracy = accuracy_score(y_test,tfidf_vect_rf_predictions)
#y_prob = tfidf_vect_rf_model.predict_proba(X_test_vectors_tfidf)[:,1]
print(classification_report(y_test,tfidf12_vect_rf_predictions))
print('Confusion Matrix:',confusion_matrix(y_test, tfidf12_vect_rf_predictions))

In [None]:
tfidf_vectorizer13 = TfidfVectorizer(lowercase=True, ngram_range=(1, 3)) #object for ngram_range(1,1)
X_train_vectors_tfidf_13 = tfidf_vectorizer13.fit_transform(X_train) 
X_test_vectors_tfidf_13 = tfidf_vectorizer13.transform(X_test)
print(X_train_vectors_tfidf_13.shape)
print(X_test_vectors_tfidf_13.shape)

In [None]:
nb_clf_13 = MultinomialNB()
tfidf13_vect_nb_model = nb_clf_13.fit(X_train_vectors_tfidf_13,y_train)
tfidf13_vect_nb_predictions = tfidf13_vect_nb_model.predict(X_test_vectors_tfidf_13)
#tfidf_vect1_accuracy = accuracy_score(y_test,tfidf13_vect_nb_predictions)
#y_prob = tfidf_vect_nb_model.predict_proba(X_test_vectors_tfidf)[:,1]
print(classification_report(y_test,tfidf13_vect_nb_predictions))
print('Confusion Matrix:',confusion_matrix(y_test, tfidf13_vect_nb_predictions))

In [None]:
lr_clf_13 = LogisticRegression(solver = 'liblinear', C=10, penalty = 'l2')
tfidf13_vect_lr_model = lr_clf_13.fit(X_train_vectors_tfidf_13,y_train)
tfidf13_vect_lr_predictions = tfidf13_vect_lr_model.predict(X_test_vectors_tfidf_13)
#tfidf_vect_lr_accuracy = accuracy_score(y_test,tfidf_vect_lr_predictions)
y_prob_13 = tfidf13_vect_lr_model.predict_proba(X_test_vectors_tfidf_13)[:,1]
print(classification_report(y_test,tfidf13_vect_lr_predictions))
print('Confusion Matrix:',confusion_matrix(y_test, tfidf13_vect_lr_predictions))

In [None]:
rf_clf_13 = RandomForestClassifier(n_estimators = 10, random_state = 42,class_weight = "balanced")
tfidf13_vect_rf_model = rf_clf_13.fit(X_train_vectors_tfidf_13,y_train)
tfidf13_vect_rf_predictions = tfidf13_vect_rf_model.predict(X_test_vectors_tfidf_13)
#tfidf12_vect_rf_accuracy = accuracy_score(y_test,tfidf_vect_rf_predictions)
#y_prob = tfidf_vect_rf_model.predict_proba(X_test_vectors_tfidf)[:,1]
print(classification_report(y_test,tfidf13_vect_rf_predictions))
print('Confusion Matrix:',confusion_matrix(y_test, tfidf13_vect_rf_predictions))

In [None]:
tfidf_vectorizer22 = TfidfVectorizer(lowercase=True, ngram_range=(2, 2)) #object for ngram_range(1,1)
X_train_vectors_tfidf_22 = tfidf_vectorizer22.fit_transform(X_train) 
X_test_vectors_tfidf_22 = tfidf_vectorizer22.transform(X_test)
print(X_train_vectors_tfidf_22.shape)
print(X_test_vectors_tfidf_22.shape)

In [None]:
nb_clf_22 = MultinomialNB()
tfidf22_vect_nb_model = nb_clf_22.fit(X_train_vectors_tfidf_22,y_train)
tfidf22_vect_nb_predictions = tfidf22_vect_nb_model.predict(X_test_vectors_tfidf_22)
#tfidf_vect1_accuracy = accuracy_score(y_test,tfidf13_vect_nb_predictions)
#y_prob = tfidf_vect_nb_model.predict_proba(X_test_vectors_tfidf)[:,1]
print(classification_report(y_test,tfidf22_vect_nb_predictions))
print('Confusion Matrix:',confusion_matrix(y_test, tfidf22_vect_nb_predictions))

In [None]:
lr_clf_22 = LogisticRegression(solver = 'liblinear', C=10, penalty = 'l2')
tfidf22_vect_lr_model = lr_clf_22.fit(X_train_vectors_tfidf_22,y_train)
tfidf22_vect_lr_predictions = tfidf22_vect_lr_model.predict(X_test_vectors_tfidf_22)
#tfidf_vect_lr_accuracy = accuracy_score(y_test,tfidf_vect_lr_predictions)
#y_prob_13 = tfidf13_vect_lr_model.predict_proba(X_test_vectors_tfidf_13)[:,1]
print(classification_report(y_test,tfidf22_vect_lr_predictions))
print('Confusion Matrix:',confusion_matrix(y_test, tfidf22_vect_lr_predictions))

In [None]:
cong_testing_data["Cleaned_Text_WO_Stopwords"] = cong_testing_data.apply(lambda x:clean_tweets(x['full_text'],1),axis=1)
cong_testing_data

In [None]:
cong_testing_data.to_csv("Processed_Tesing_Data.csv")

In [None]:
test_data = tfidf_vectorizer1.transform(cong_testing_data["Cleaned_Text_WO_Stopwords"])
print(test_data.shape)

In [None]:
test_pedictions = tfidf_vect_lr_model.predict(test_data)
prediction_labels = label_encoder.inverse_transform(test_pedictions)

In [None]:
submissions_df = pd.DataFrame()
submissions_df["Id"] = cong_testing_data["Id"]
submissions_df["party"] = prediction_labels
submissions_df.to_csv("sample_submission_v1.csv",index=False)

In [None]:
clf = TruncatedSVD(1000)
clf.fit(X_train_vectors_tfidf)
pca_train_values = clf.transform(X_train_vectors_tfidf)
pca_test_values = clf.transform(X_test_vectors_tfidf)
pca_train_df = pd.DataFrame(pca_train_values)
#pca_train_df.columns = ["pca_dim_1","pca_dim_2"]
pca_test_df = pd.DataFrame(pca_train_values)

In [None]:
print(X_test_vectors_tfidf.shape)
print(X_train_vectors_tfidf.shape)

In [None]:
print(pca_train_df.shape)
print(pca_test_df.shape)

In [None]:
clf.explained_variance_ratio_