## Political Party Classification

In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 
import sklearn
import spacy
import seaborn as sns 
import re
import nltk
import wordcloud
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
# Global plotting defaults shared by every figure in this notebook.
plt.style.use('ggplot')
plt.rcParams.update({
    'font.family': 'sans-serif',
    'font.serif': 'Ubuntu',
    'font.monospace': 'Ubuntu Mono',
    'font.size': 14,
    'axes.labelsize': 12,
    'axes.labelweight': 'bold',
    'axes.titlesize': 12,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
    'legend.fontsize': 12,
    'figure.titlesize': 12,
    'image.cmap': 'jet',
    'image.interpolation': 'none',
    'figure.figsize': (20, 20),
    'axes.grid': False,
})
import string
from sklearn.naive_bayes import MultinomialNB

# Default line width and marker size for line plots.
plt.rcParams.update({'lines.linewidth': 2, 'lines.markersize': 8})
# Named xkcd colours available for plots. Each entry is the "xkcd:" prefix
# immediately followed by the colour name; the previous 'xkcd: goldenrod'
# contained a stray space after the colon and was not a valid colour name.
colors = [
    'xkcd:pale orange',
    'xkcd:sea blue',
    'xkcd:pale red',
    'xkcd:sage green',
    'xkcd:terra cotta',
    'xkcd:dull purple',
    'xkcd:teal',
    'xkcd:goldenrod',
    'xkcd:cadet blue',
    'xkcd:scarlet',
]

In [2]:
# Load the raw tweets and report how many rows were read.
data = pd.read_csv('ExtractedTweets.csv')
print(f'The total number of tweet is {len(data)}')

FileNotFoundError: [Errno 2] No such file or directory: 'ExtractedTweets.csv'

In [None]:
# Shuffle every row, rebuild a 0..n-1 index, and drop the old index column
# together with the 'Handle' column.
data = data.sample(frac=1).reset_index().drop(columns=['index','Handle'])
# Keep the rows labelled 0 through 5000. Label-based .loc slicing includes
# both endpoints, so this keeps up to 5001 rows.
data = data.loc[0:5000]

In [None]:
# Preview the first rows of the shuffled sample.
data.head()

In [None]:
# Number of sampled tweets attributed to each party.
DEM = int((data.Party == 'Democrat').sum())
REP = int((data.Party == 'Republican').sum())


In [None]:
# Pie chart of the class balance between the two parties.
explode = (0, 0.1)  # pull the Republican slice slightly out of the pie
labels = ['Democrats Tweets','Republican Tweets']
# Slice colours follow the order of `labels`. They were previously inverted
# (Democrats red, Republicans blue), which contradicted the word-cloud titles
# in this notebook (Democrats navy, Republicans red).
plt.pie([DEM,REP], explode=explode, labels=labels, autopct='%1.1f%%',
        shadow=True, startangle=90,colors=['blue','red'])

In [None]:

# Normalise every raw tweet into lowercase, space-separated words.
features = data.Tweet.tolist()
processed_features = []

for raw_tweet in features:
    processed_feature = str(raw_tweet)

    # Drop links and the HTML entity "&amp;" while their punctuation is still
    # present. Previously this ran after the \W substitution, where
    # 'https://t.co' could never match, and the bare substring 'amp' was
    # deleted from ordinary words such as "campaign".
    processed_feature = re.sub(r'https?://\S+', ' ', processed_feature, flags=re.I)
    processed_feature = re.sub(r'&amp;', ' ', processed_feature)

    # Replace every remaining non-word character with a space
    processed_feature = re.sub(r'\W', ' ', processed_feature)

    # Remove single letters surrounded by whitespace
    processed_feature = re.sub(r'\s+[a-zA-Z]\s+', ' ', processed_feature)

    # Remove a single letter at the very start (the caret is an anchor here;
    # the old pattern escaped it and therefore looked for a literal '^')
    processed_feature = re.sub(r'^[a-zA-Z]\s+', ' ', processed_feature)

    # Collapse runs of whitespace into one space
    processed_feature = re.sub(r'\s+', ' ', processed_feature)

    # Remove a prefixed 'b' (left over from stringified bytes literals)
    processed_feature = re.sub(r'^b\s+', '', processed_feature)

    # Convert to lowercase
    processed_feature = processed_feature.lower()

    processed_features.append(processed_feature)

In [None]:
def remove_punct(text):
    """Return *text* with ASCII punctuation and the digits 0-9 removed."""
    # A single translation table deletes both character classes in one pass.
    unwanted = str.maketrans('', '', string.punctuation + string.digits)
    return text.translate(unwanted)

In [None]:
# Write the cleaned text back to the frame with punctuation and digits removed.
data['Tweet'] = [remove_punct(tweet) for tweet in processed_features]
data.head(10)

In [None]:
# Build one text blob per party and draw the two word clouds side by side.
dem_data = data[data['Party']=='Democrat']
rep_data = data[data['Party']=='Republican']
dem_tweet_list = dem_data.Tweet.tolist()
rep_tweet_list = rep_data.Tweet.tolist()
# Join with a space: plain concatenation could fuse the last word of one
# tweet with the first word of the next. join() also avoids rebuilding the
# string on every iteration.
dem_text = ' '.join(dem_tweet_list)
rep_text = ' '.join(rep_tweet_list)
import wordcloud
wordcloud_dem = wordcloud.WordCloud().generate(dem_text)
wordcloud_rep = wordcloud.WordCloud().generate(rep_text)
plt.rcParams['figure.figsize']=(20,20)
plt.subplot(1,2,1)
plt.title('Democrats Word-Cloud',color='navy',fontsize=20)
plt.imshow(wordcloud_dem)
plt.subplot(1,2,2)
plt.title('Republicans Word-Cloud',color='red',fontsize=20)
plt.imshow(wordcloud_rep)

__Note: prominent words in the word clouds: American/America, house, student, support, Trump/President.__

In [None]:
# Turn the tweets into TF-IDF features and split them for the tuning cells.
# NOTE(review): assumes column 0 is the party label and column 1 the tweet
# text after the 'index'/'Handle' columns were dropped - confirm.
processed_features = data.iloc[:,1].values
labels = data.iloc[:, 0].values
vectorizer = TfidfVectorizer (max_features=2500, min_df=7, max_df=0.8)
processed_features = vectorizer.fit_transform(processed_features).toarray()
# The tuning cells below read X_train/X_test/y_train/y_test, which were never
# created at notebook level. Split with the same settings that
# hyperparameter_tuning() and opt_run() use.
X_train, X_test, y_train, y_test = train_test_split(processed_features, labels, test_size=0.1, random_state=0)

In [None]:
# Sweep the number of trees and record the accuracy on X_test for each value.
N_ESTIMATOR = [20, 50, 100, 200, 500]
ACC = []
for n in N_ESTIMATOR:
    text_classifier = RandomForestClassifier(n_estimators=n, random_state=0).fit(X_train, y_train)
    predictions = text_classifier.predict(X_test)
    ACC.append(accuracy_score(y_test, predictions))

In [None]:
# Accuracy as a function of the number of trees.
plt.rcParams['figure.figsize'] = (10, 10)
ax = plt.gca()
ax.plot(N_ESTIMATOR, ACC, color='navy')
ax.grid(True)
ax.set_xlabel('# Estimator')
ax.set_ylabel('Accuracy')
ax.set_title('Estimator Number Hyperparameter Tuning', fontsize=20)

In [None]:
# Tree count with the highest recorded accuracy (first one on ties).
opt_n = N_ESTIMATOR[int(np.argmax(ACC))]

In [None]:
# Sweep the maximum tree depth with the tree count fixed at opt_n.
DEPTH = np.arange(10, 200, 10)
ACC = []
for n in DEPTH:
    text_classifier = RandomForestClassifier(
        n_estimators=opt_n, max_depth=n, random_state=0
    ).fit(X_train, y_train)
    predictions = text_classifier.predict(X_test)
    ACC.append(accuracy_score(y_test, predictions))

In [None]:
# Depth with the highest recorded accuracy (first one on ties).
opt_depth = DEPTH[np.argmax(ACC)]

In [None]:
# Accuracy as a function of the maximum tree depth.
ax = plt.gca()
ax.plot(DEPTH, ACC, color='navy')
ax.grid(True)
ax.set_xlabel('Depth')
ax.set_ylabel('Accuracy')
ax.set_title('Depth Hyperparameter Tuning', fontsize=20)

In [None]:
# Compare the two split criteria with tree count and depth fixed.
CRITERIONS = ['gini', 'entropy']
ACC = []
for c in CRITERIONS:
    text_classifier = RandomForestClassifier(
        n_estimators=opt_n, max_depth=opt_depth, criterion=c, random_state=0
    ).fit(X_train, y_train)
    predictions = text_classifier.predict(X_test)
    ACC.append(accuracy_score(y_test, predictions))

In [None]:
# Bar chart of the accuracy reached with each split criterion.
ax = sns.barplot(x=['Gini', 'Entropy'], y=ACC, palette='plasma')
ax.set_title('Criterion Hyperparameter Tuning', fontsize=20)
ax.set_xlabel('Criterion')
ax.set_ylabel('Accuracy')
ax.grid(True)
ax.set_ylim(0.60, 0.68)

In [None]:
def clean_tweet(text):
    """Normalise one raw tweet into lowercase, space-separated words.

    Links and the HTML entity ``&amp;`` are removed first, while their
    punctuation is still intact. Every remaining non-word character then
    becomes a space, isolated single letters are dropped, whitespace runs are
    collapsed and the result is lowercased.

    Args:
        text: The raw tweet; it is converted with ``str()`` first.

    Returns:
        str: The cleaned text. A leading or trailing single space may remain.
    """
    cleaned = str(text)
    # Must run before the \W substitution: afterwards 'https://t.co' can no
    # longer match, and deleting the bare substring 'amp' (as before) damaged
    # ordinary words such as "campaign".
    cleaned = re.sub(r'https?://\S+', ' ', cleaned, flags=re.I)
    cleaned = re.sub(r'&amp;', ' ', cleaned)
    # Replace every non-word character with a space.
    cleaned = re.sub(r'\W', ' ', cleaned)
    # Remove single letters surrounded by whitespace.
    cleaned = re.sub(r'\s+[a-zA-Z]\s+', ' ', cleaned)
    # Remove a single letter at the very start. The caret is an anchor; the
    # old pattern escaped it and so searched for a literal '^'.
    cleaned = re.sub(r'^[a-zA-Z]\s+', ' ', cleaned)
    # Collapse runs of whitespace.
    cleaned = re.sub(r'\s+', ' ', cleaned)
    # Remove a prefixed 'b' (left over from stringified bytes literals).
    cleaned = re.sub(r'^b\s+', '', cleaned)
    return cleaned.lower()


def sample_data():
    """Load 'ExtractedTweets.csv' and return a shuffled, cleaned sample.

    The rows are shuffled, the old index and the 'Handle' column are dropped,
    and the rows labelled 0 through 5000 are kept (``.loc`` slicing is
    inclusive, so up to 5001 rows). Each tweet is cleaned with
    ``clean_tweet`` and then passed through ``remove_punct``.

    Returns:
        pandas.DataFrame: The sample with its 'Tweet' column cleaned.

    Raises:
        FileNotFoundError: If 'ExtractedTweets.csv' is not in the working
            directory.
    """
    data = pd.read_csv('ExtractedTweets.csv')
    data = data.sample(frac=1).reset_index().drop(columns=['index', 'Handle'])
    data = data.loc[0:5000]
    data['Tweet'] = [remove_punct(clean_tweet(tweet)) for tweet in data.Tweet]
    return data

In [None]:
def hyperparameter_tuning(data):
    """Greedily pick random-forest settings, one hyperparameter at a time.

    The tweets are vectorised with TF-IDF and split 90/10. The number of
    trees is chosen first, then the maximum depth (with the chosen tree
    count), then the split criterion (with both previous choices). Each
    choice maximises accuracy on the 10% split; ties go to the first
    candidate.

    Args:
        data: DataFrame read positionally: column 0 supplies the labels and
            column 1 the text. NOTE(review): presumably the party and tweet
            columns returned by sample_data() - confirm the column order.

    Returns:
        list: ``[opt_n, opt_d, opt_c]`` - the chosen number of estimators,
        maximum depth and criterion.
    """
    processed_features = data.iloc[:, 1].values
    labels = data.iloc[:, 0].values
    vectorizer = TfidfVectorizer(max_features=2500, min_df=7, max_df=0.8)
    processed_features = vectorizer.fit_transform(processed_features).toarray()
    X_train, X_test, y_train, y_test = train_test_split(
        processed_features, labels, test_size=0.1, random_state=0)

    def split_accuracy(**forest_params):
        # Fit one forest with the given settings and score it on the split.
        model = RandomForestClassifier(random_state=0, **forest_params)
        model.fit(X_train, y_train)
        return accuracy_score(y_test, model.predict(X_test))

    N_ESTIMATOR = [20, 50, 100, 200]
    ACC = [split_accuracy(n_estimators=n) for n in N_ESTIMATOR]
    opt_n = N_ESTIMATOR[np.array(ACC).argmax()]
    print('Optimal Number of Estimator found! \n')

    DEPTH = np.arange(10, 150, 50)
    ACC = [split_accuracy(n_estimators=opt_n, max_depth=d) for d in DEPTH]
    opt_d = DEPTH[np.array(ACC).argmax()]
    print('Optimal Depth found! \n')

    CRITERIONS = ['gini', 'entropy']
    # Use the depth selected above. This step previously read `opt_depth`,
    # which is not defined in this function, so it either raised NameError or
    # silently picked up a stale notebook-level value.
    ACC = [
        split_accuracy(n_estimators=opt_n, max_depth=opt_d, criterion=c)
        for c in CRITERIONS
    ]
    opt_c = CRITERIONS[np.array(ACC).argmax()]
    print('Optimal criterion found! \n')
    return [opt_n, opt_d, opt_c]

In [None]:
# Draw a fresh shuffled and cleaned sample from the CSV.
data = sample_data()

In [None]:
# Repeat the search on ten fresh samples and collect each run's winners.
CV_N, CV_D, CV_C = [], [], []
for i in range(10):
    data = sample_data()
    ht = hyperparameter_tuning(data)
    for bucket, winner in zip((CV_N, CV_D, CV_C), ht):
        bucket.append(winner)

In [None]:
# How often each candidate value was selected across the repeated searches.
# The values are passed as x= explicitly: positional use was deprecated in
# seaborn 0.11, and later releases treat the first positional argument as
# `data` rather than as the values to count.
plt.subplot(3,1,1)
plt.title('Chosen Number of Estimators')
sns.countplot(x=CV_N,palette='plasma')
plt.subplot(3,1,2)
plt.title('Chosen Depth')
sns.countplot(x=CV_D,palette='plasma')
plt.subplot(3,1,3)
plt.title('Criterion')
sns.countplot(x=CV_C,palette='plasma')

In [None]:
# Fixed hyperparameters in the order opt_run() reads them:
# [n_estimators, max_depth, criterion].
OPT_P = [100,60,'gini']

In [None]:
def opt_run(data,hyper_parameters,plot_results, print_results):
    """Train and score one random forest with the given hyperparameters.

    The text is vectorised with TF-IDF, split 90/10, and a
    RandomForestClassifier is fitted on the 90% part.

    Args:
        data: DataFrame read positionally: column 0 supplies the labels and
            column 1 the text. NOTE(review): presumably the output of
            sample_data() - confirm the column order.
        hyper_parameters: Sequence ``[n_estimators, max_depth, criterion]``.
        plot_results: If true, draw the confusion matrix for the 10% split.
        print_results: If true, print the confusion matrix, the
            classification report and the accuracy.

    Returns:
        float: Accuracy on the 10% split.
    """
    processed_features = data.iloc[:,1].values
    labels = data.iloc[:, 0].values
    vectorizer = TfidfVectorizer (max_features=2500, min_df=7, max_df=0.8)
    processed_features = vectorizer.fit_transform(processed_features).toarray()
    X_train, X_test, y_train, y_test = train_test_split(processed_features, labels, test_size=0.1, random_state=0)
    opt_n, opt_depth, opt_c = hyper_parameters[0], hyper_parameters[1], hyper_parameters[2]
    text_classifier = RandomForestClassifier(n_estimators=opt_n, max_depth=opt_depth , criterion=opt_c, random_state=0)
    text_classifier.fit(X_train, y_train)
    predictions = text_classifier.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    if print_results:
        print('Confusion Metrics: \n')
        print(confusion_matrix(y_test,predictions))
        print(classification_report(y_test,predictions))
        print('ACCURACY ', accuracy)
    if plot_results:
        # plot_confusion_matrix was removed from scikit-learn in 1.2; prefer
        # its replacement and fall back only on releases that lack it.
        from sklearn.metrics import ConfusionMatrixDisplay
        if hasattr(ConfusionMatrixDisplay, 'from_estimator'):
            ConfusionMatrixDisplay.from_estimator(
                text_classifier, X_test, y_test, cmap=plt.cm.Blues)
        else:
            from sklearn.metrics import plot_confusion_matrix
            plot_confusion_matrix(text_classifier, X_test, y_test,
                                  cmap=plt.cm.Blues)
    return accuracy

In [None]:
# Evaluate the fixed hyperparameters on a fresh sample, printing and plotting
# the results.
opt_run(sample_data(),OPT_P,plot_results=True,print_results=True)

In [None]:
# Fit a Multinomial Naive Bayes model on 50 fresh samples and keep each
# run's accuracy on the 10% split.
ACC = []
for i in range(50):
    data = sample_data()
    processed_features = data.iloc[:, 1].values
    labels = data.iloc[:, 0].values
    vectorizer = TfidfVectorizer(max_features=2500, min_df=7, max_df=0.8)
    processed_features = vectorizer.fit_transform(processed_features).toarray()
    X_train, X_test, y_train, y_test = train_test_split(
        processed_features, labels, test_size=0.1, random_state=0)
    clf = MultinomialNB().fit(X_train, y_train)
    predictions = clf.predict(X_test)
    ACC.append(accuracy_score(y_test, predictions))

In [None]:
# Hold the accuracies in a NumPy array for the summary statistics below.
ACC = np.asarray(ACC)

In [None]:
# Mean accuracy over all runs, rounded to two decimals.
print(f'The Multinomial NB classifier permitted to obtain the following accuracy {ACC.mean():.2f}')

In [None]:
# Distribution of the accuracies across the repeated runs.
plt.title('Multinomial NB Accuracy',color='red')
# sns.distplot is deprecated, and its histogram is density-normalised, which
# contradicts the 'Counts' label below. histplot plots raw counts by default.
if hasattr(sns, 'histplot'):
    sns.histplot(ACC,color='darkorange',bins=5,kde=True)
else:
    # seaborn releases that predate histplot
    sns.distplot(ACC,color='darkorange',bins=5)
plt.xlabel('Accuracy')
plt.ylabel('Counts')
plt.grid(True)

In [None]:
def important_features(n=20):
    """Print the n highest-weighted words per class of a fresh MultinomialNB.

    A new sample is drawn with sample_data(), vectorised with TF-IDF and
    split 90/10. MultinomialNB is fitted on the 90% part, and the vocabulary
    is ranked per class by the model's ``feature_count_`` row.

    Args:
        n: Number of top-ranked words to print for each class.

    Returns:
        None. The rankings are printed.
    """
    data = sample_data()
    processed_features = data.iloc[:,1].values
    labels = data.iloc[:, 0].values
    vectorizer = TfidfVectorizer (max_features=2500, min_df=7, max_df=0.8)
    processed_features = vectorizer.fit_transform(processed_features).toarray()
    X_train, X_test, y_train, y_test = train_test_split(processed_features, labels, test_size=0.1, random_state=0)
    clf = MultinomialNB()
    clf.fit(X_train,y_train)
    # get_feature_names() was removed in scikit-learn 1.2; use its
    # replacement when the installed release provides it.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()
    class_labels = clf.classes_
    # Take the first n entries of each ranking. The old slice [20:n] skipped
    # the 20 strongest words and was empty for the default n=20.
    topn_class1 = sorted(zip(clf.feature_count_[0], feature_names),reverse=True)[:n]
    topn_class2 = sorted(zip(clf.feature_count_[1], feature_names),reverse=True)[:n]

    print("Important words in Democrat")

    for coef, feat in topn_class1:
        print(class_labels[0], coef, feat)

    print("-----------------------------------------")
    print("Important words in Republicans")

    for coef, feat in topn_class2:
        print(class_labels[1], coef, feat)

In [None]:
# Print the per-class word rankings with n=40.
important_features(n=40)

Recurring words among the listed features: House, Trump, Bill, Will, Tax, Happy, Congress.

In [None]:
# Train a support-vector classifier (default settings) on a fresh sample and
# print its accuracy on the 10% split.
data = sample_data()
processed_features = data.iloc[:, 1].values
labels = data.iloc[:, 0].values
vectorizer = TfidfVectorizer(max_features=2500, min_df=7, max_df=0.8)
processed_features = vectorizer.fit_transform(processed_features).toarray()
X_train, X_test, y_train, y_test = train_test_split(
    processed_features, labels, test_size=0.1, random_state=0)
clf = SVC().fit(X_train, y_train)
predictions = clf.predict(X_test)
print('ACCURACY ', accuracy_score(y_test, predictions))


In [None]:
# Confusion matrix of the SVC on the 10% split.
plt.rcParams['figure.figsize'] = (10, 10)
# plot_confusion_matrix was removed from scikit-learn in 1.2; prefer its
# replacement and fall back only on releases that lack it.
from sklearn.metrics import ConfusionMatrixDisplay
if hasattr(ConfusionMatrixDisplay, 'from_estimator'):
    disp = ConfusionMatrixDisplay.from_estimator(clf, X_test, y_test,
                                                 cmap='plasma')
else:
    from sklearn.metrics import plot_confusion_matrix
    disp = plot_confusion_matrix(clf, X_test, y_test,
                         cmap='plasma')
plt.xticks([0,1],['Democrats','Republicans'])
plt.yticks([0,1],['Democrats','Republicans'])



In [None]:
# NOTE(review): presumably records/uploads this notebook through the jovian
# package - confirm before running outside the original environment.
import jovian
jovian.commit()