import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from nltk.corpus import stopwords
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, roc_curve, roc_auc_score, classification_report
from sklearn.naive_bayes import MultinomialNB
from xgboost.sklearn import XGBClassifier
from collections import Counter
from nltk.tokenize import word_tokenize, sent_tokenize

In [None]:
# Load the debate transcript CSV (judging by the file name, the first 2020
# presidential debate) using pandas' pure-Python parsing engine.
data_raw = pd.read_csv(
    '../input/us-election-2020-presidential-debates/us_election_2020_1st_presidential_debate.csv',
    engine='python',
)

In [None]:
# Preview the first rows; head() returns five rows by default.
data_raw.head()

In [None]:
# Remove every row spoken by 'Chris Wallace', then discard rows that still
# contain missing values. Both operations mutate data_raw in place.
wallace_mask = data_raw['speaker'].eq('Chris Wallace')
data_raw.drop(index=data_raw.loc[wallace_mask].index, inplace=True)
data_raw.dropna(inplace=True)

# Summary of the remaining columns and non-null counts.
data_raw.info()

In [None]:
def most_common_words(name, data, number_of_words):
    """Return the most frequent non-stopword tokens spoken by one speaker.

    Args:
        name: Value of the ``speaker`` column selecting the rows to analyse.
        data: DataFrame with a ``speaker`` column and a ``text`` column whose
            values are strings (``.lower()`` is called on each of them).
        number_of_words: How many of the most frequent tokens to return.

    Returns:
        A DataFrame with the columns ``Word`` and ``Frequency``, in the order
        produced by ``Counter.most_common`` (most frequent first).
    """
    candidate_rows = data[data['speaker'] == name]

    # Join once instead of growing a string inside a loop, which re-copies
    # the accumulated text on every iteration.
    candidate_text = ' '.join(line.lower() for line in candidate_rows['text'])

    # A set gives constant-time membership tests; the original list was
    # scanned in full for every token.
    stopset = set(stopwords.words('english'))
    # Punctuation tokens and filler words that would otherwise dominate the
    # frequency table.
    stopset.update([
        '.', ',', '’', '?', '[', ']', '…', 'going', 'want', 'know', 'look',
        'would', 'said', 'got', 'think', 'say', 'tell', 'go', 'get',
        'crosstalk', 'well', 'like', 'much', 'make',
    ])

    kept_tokens = (
        token for token in word_tokenize(candidate_text)
        if token not in stopset
    )
    top_words = Counter(kept_tokens).most_common(number_of_words)

    return pd.DataFrame(top_words, columns=['Word', 'Frequency'])

In [None]:
# Ten most frequent words in the rows attributed to 'President Donald J. Trump'.
df_trump = most_common_words(
    name='President Donald J. Trump',
    data=data_raw,
    number_of_words=10,
)
df_trump

In [None]:
# Ten most frequent words in the rows attributed to 'Vice President Joe Biden'.
df_biden = most_common_words(
    name='Vice President Joe Biden',
    data=data_raw,
    number_of_words=10,
)
df_biden

In [None]:
# Side-by-side bar charts of each speaker's word frequencies on a shared
# y-axis, so bar heights are directly comparable between the two panels.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(12, 6), sharey=True)

sns.barplot(data=df_trump, x='Word', y='Frequency', palette='rocket', ax=ax[0])
sns.barplot(data=df_biden, x='Word', y='Frequency', palette='mako', ax=ax[1])

ax[0].set_title('Donald Trump')
ax[1].set_title('Joe Biden')

# Rotate the tick labels on both panels.
ax[0].tick_params(labelrotation=45)
ax[1].tick_params(labelrotation=45)

# Write the figure to disk, trimming surrounding whitespace.
fig.savefig('test2.jpg', bbox_inches='tight')

In [None]:
# English stopword list from NLTK, passed to the vectorizer later on.
stop_words = stopwords.words('english')

# Features are the spoken text; the target is who said it.
X, y = data_raw['text'], data_raw['speaker']

# Class balance: relative shares first, then absolute counts.
print(y.value_counts(normalize=True))
y.value_counts()

In [None]:
# Map the speaker names to integer class labels.
le = LabelEncoder()
le.fit(y)
y_encoded = le.transform(y)

In [None]:
# Show the fitted class labels; a label's position in this array is the
# integer that the encoder assigned to it.
le.classes_

In [None]:
# Stratified hold-out split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, stratify=y_encoded, random_state=1)

# The vectorizer is fitted on the training texts only and merely applied to
# the test texts, so no vocabulary or IDF statistics leak from the test set.
tf_idf = TfidfVectorizer(stop_words=stop_words, lowercase=True)

bag_of_words_train = tf_idf.fit_transform(X_train)
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old method on older releases.
if hasattr(tf_idf, 'get_feature_names_out'):
    bag_of_words_train_features = list(tf_idf.get_feature_names_out())
else:
    bag_of_words_train_features = tf_idf.get_feature_names()
bag_of_words_train_df = pd.DataFrame(bag_of_words_train.toarray(), columns=bag_of_words_train_features)

bag_of_words_test = tf_idf.transform(X_test)
# The fitted vocabulary is the same for train and test, so reuse the names
# instead of querying the vectorizer a second time.
bag_of_words_test_features = list(bag_of_words_train_features)
bag_of_words_test_df = pd.DataFrame(bag_of_words_test.toarray(), columns=bag_of_words_test_features)

Multinomial Naive Bayes

In [None]:
# Shuffled, seeded stratified folds shared by the grid searches.
kfold = StratifiedKFold(random_state=1, shuffle=True)

# Ten evenly spaced smoothing values between 0.01 and 1.
parameters_mnb = {
    'alpha': np.linspace(0.01, 1, 10),
}

mnb = MultinomialNB()
# Pass cv=kfold explicitly: without it the search ignores the seeded,
# shuffled folds defined above and uses GridSearchCV's default splitter.
grid_search_mnb = GridSearchCV(mnb, param_grid=parameters_mnb, scoring='accuracy', cv=kfold)
grid_search_mnb.fit(bag_of_words_train_df, y_train)

In [None]:
# Class probabilities on the held-out set; column 1 is kept as the score
# for the class encoded as 1.
y_pred_score_classes_mnb = grid_search_mnb.predict_proba(bag_of_words_test_df)
y_pred_score_mnb = y_pred_score_classes_mnb[:, 1]

# Hard class predictions from the refitted best estimator.
y_pred_class_mnb = grid_search_mnb.predict(bag_of_words_test_df)

classif_report_mnb = classification_report(y_true=y_test, y_pred=y_pred_class_mnb)
print(classif_report_mnb)

In [None]:
# Confusion matrix for the Naive Bayes predictions on the test set.
confusion_matrix(y_true=y_test, y_pred=y_pred_class_mnb)

Random Forest

In [None]:
# Seed the forest so repeated runs give the same model; the seed matches the
# one used for the train/test split and the CV folds.
rfc = RandomForestClassifier(random_state=1)
rfc.fit(bag_of_words_train_df, y_train)

# Hard class predictions and class probabilities on the held-out set;
# column 1 is kept as the score for the class encoded as 1.
y_pred_class_rfc = rfc.predict(bag_of_words_test_df)
y_pred_score_classes_rfc = rfc.predict_proba(bag_of_words_test_df)
y_pred_score_rfc = y_pred_score_classes_rfc[:, 1]

In [None]:
# Per-class precision, recall and F1 for the random forest predictions.
classif_report = classification_report(y_true=y_test, y_pred=y_pred_class_rfc)
print(classif_report)

In [None]:
# Confusion matrix for the random forest predictions on the test set.
confusion_matrix(y_true=y_test, y_pred=y_pred_class_rfc)

In [None]:
# Share of test samples the random forest classified correctly.
accuracy_score(y_true=y_test, y_pred=y_pred_class_rfc)

In [None]:
# ROC curve and AUC for the random forest, based on the column-1 scores.
fpr, tpr, thresholds = roc_curve(y_test, y_pred_score_rfc)
print(roc_auc_score(y_test, y_pred_score_rfc))

plt.plot(fpr, tpr)
# Dashed diagonal from (0, 0) to (1, 1) as the reference line.
plt.plot([0, 1], ls="--")
# Grey guides: the left edge (x = 0) and the top edge (y = 1) of the plot.
plt.plot([0, 0], [1, 0], c=".7")
plt.plot([1, 1], c=".7")

plt.title('Receiver Operating Characteristic')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.show()

Extreme Gradient Boosting

In [None]:
# Hyper-parameter grid for the boosted trees: 3 x 4 x 2 = 24 combinations.
params_xgb = dict(
    n_estimators=[50, 100, 500],
    max_depth=[1, 2, 3, 4],
    learning_rate=[0.01, 0.1],
)

# NOTE(review): both the estimator and the search ask for all cores
# (n_jobs=-1); confirm this does not oversubscribe the machine.
xgb = XGBClassifier(n_jobs=-1)
grid_search_xgb = GridSearchCV(
    estimator=xgb,
    param_grid=params_xgb,
    cv=kfold,
    verbose=1,
    n_jobs=-1,
)
grid_search_xgb.fit(bag_of_words_train_df, y_train)

In [None]:
# Class probabilities on the held-out set; column 1 is kept as the score
# for the class encoded as 1.
y_pred_score_classes_xgb = grid_search_xgb.predict_proba(bag_of_words_test_df)
y_pred_score_xgb = y_pred_score_classes_xgb[:, 1]

# Hard class predictions from the refitted best estimator.
y_pred_class_xgb = grid_search_xgb.predict(bag_of_words_test_df)

classif_report = classification_report(y_true=y_test, y_pred=y_pred_class_xgb)
print(classif_report)

In [None]:
# Confusion matrix for the XGBoost predictions on the test set.
confusion_matrix(y_true=y_test, y_pred=y_pred_class_xgb)