In [2]:
import pandas as pd
import numpy as np 
from sklearn.model_selection import train_test_split
from tqdm import tqdm 
import nltk 
from nltk.stem import WordNetLemmatizer 
from nltk.tokenize import RegexpTokenizer, sent_tokenize, word_tokenize, TweetTokenizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import enchant
from pprint import pprint
import pickle
from sklearn.utils import resample

def import_tweet_data():
    """Load the original tweet CSV into memory.

    The file ``data/TweetsOriginal.csv`` is read relative to the current
    working directory and decoded as ISO-8859-1.

    Output: A Pandas DataFrame
    """
    return pd.read_csv('data/TweetsOriginal.csv', encoding='ISO-8859-1')

def encode_emotion_3(x):
    """Encode a raw emotion label as a 3-class integer code.

    The comparison is case-insensitive.

    Parameters
    ----------
    x : str
        Raw label text.

    Returns
    -------
    int or None
        0 for 'negative emotion', 1 for 'positive emotion',
        2 for 'no emotion toward brand or product'.
        None for any other label and for non-string input (e.g. NaN or
        None), so missing labels can be dropped later instead of crashing
        on ``.lower()``.
    """
    if not isinstance(x, str):
        return None
    codes = {
        'negative emotion': 0,
        'positive emotion': 1,
        'no emotion toward brand or product': 2,
    }
    return codes.get(x.lower())
    
    
def encode_emotion_2(x):
    """Encode a raw emotion label as a binary integer code.

    The comparison is case-insensitive.

    Parameters
    ----------
    x : str
        Raw label text.

    Returns
    -------
    int or None
        0 for 'negative emotion', 1 for 'positive emotion'.
        None for any other label and for non-string input (e.g. NaN or
        None), so missing labels can be dropped later instead of crashing
        on ``.lower()``.
    """
    if not isinstance(x, str):
        return None
    codes = {'negative emotion': 0, 'positive emotion': 1}
    return codes.get(x.lower())
    


def clean_split(split_type, df):
    """Clean the tweets, balance the classes, vectorise and pickle a split.

    Steps, in order:

    1. Copy the tweet text and raw emotion columns and encode the emotion
       with ``encode_emotion_2`` when ``split_type == 2``, otherwise with
       ``encode_emotion_3``.
    2. Drop rows with missing text or a label the encoder does not know.
    3. Keep only whitespace-separated words accepted by the en_US enchant
       dictionary, then lowercase and tokenise (TweetTokenizer followed by
       a ``\\w+`` regexp tokenizer) and re-join with single spaces.
    4. Downsample the positive class (and the neutral class for the
       3-class split) to 600 rows without replacement; all negative rows
       are kept.
    5. Stratified 85/15 train/test split, then bag-of-n-grams features
       (1- to 3-grams, at most 5000 features, English stop words removed).
    6. Pickle the four artefacts to ``../Pickles/{split_type}_*.p``.

    Parameters
    ----------
    split_type : int
        2 selects the binary encoding; any other value selects the
        3-class encoding. Also used as the pickle file-name prefix.
    df : pandas.DataFrame
        Must contain the columns ``tweet_text`` and
        ``is_there_an_emotion_directed_at_a_brand_or_product``.

    Returns
    -------
    tuple
        ``(train_features, test_features, y_train, y_test)`` where the
        feature matrices are dense arrays and the targets are Series.

    Raises
    ------
    ValueError
        Raised by ``resample`` if a class to be downsampled has fewer than
        600 rows.
    """
    new_df = pd.DataFrame({
        'Text': df['tweet_text'],
        'Emotion': df['is_there_an_emotion_directed_at_a_brand_or_product'],
    })

    # na_action='ignore' keeps missing labels as NaN instead of handing them
    # to the encoder, which expects a string.
    encoder = encode_emotion_2 if split_type == 2 else encode_emotion_3
    new_df['Emotion_New'] = new_df['Emotion'].map(encoder, na_action='ignore')

    # Dropping NA in columns Text and Emotion_New.
    new_df = new_df.dropna(subset=['Text', 'Emotion_New'])

    # Keep only tokens the English dictionary recognises. This removes
    # @mentions, hashtags, URLs and misspellings alike.
    en_us = enchant.Dict("en_US")
    word_tokenizer = RegexpTokenizer(r'\w+')
    tweet_token = TweetTokenizer()

    def _normalise(text):
        """Dictionary-filter, lowercase and tokenise one tweet."""
        kept = ' '.join(w for w in text.split() if en_us.check(w))
        tweet_tokens = ' '.join(tweet_token.tokenize(kept.lower()))
        return ' '.join(word_tokenizer.tokenize(tweet_tokens.lower()))

    # Build a new column rather than writing into ``.values`` in place, which
    # depends on the array being a writable view of the frame.
    new_df = new_df.assign(Text=new_df['Text'].map(_normalise))

    print('Original Value Counts')
    print(new_df.Emotion_New.value_counts())
    print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')

    pos_df = new_df[new_df.Emotion_New == 1]
    neg_df = new_df[new_df.Emotion_New == 0]
    resample_pos = resample(pos_df, n_samples = 600, random_state = 10, replace = False)

    # Row order is kept as before (it feeds the seeded split below).
    if split_type == 2:
        parts = [resample_pos, neg_df]
    else:
        neut_df = new_df[new_df.Emotion_New == 2]
        resample_neut = resample(neut_df, n_samples = 600, random_state = 10, replace = False)
        parts = [neg_df, resample_pos, resample_neut]

    # DataFrame.append was removed in pandas 2.0; concat is the replacement.
    new_df = pd.concat(parts, ignore_index = True)

    print('Final Resampled Value Counts')
    print(new_df.Emotion_New.value_counts())
    print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')

    # Split into test and train.
    x_train, x_test, y_train, y_test = train_test_split(
        new_df.Text, new_df.Emotion_New, stratify = new_df.Emotion_New,
        train_size = .85, random_state = 10)

    # Removing stop words while building the n-gram counts.
    stop = stopwords.words('english')
    vectorizer = CountVectorizer(stop_words = stop, max_features = 5000, ngram_range=(1,3))

    # Fit the vocabulary on the training text only, then reuse it for the
    # test text so both matrices share the same columns.
    train_features = vectorizer.fit_transform(x_train.values).toarray()
    test_features = vectorizer.transform(x_test.values).toarray()

    # Pickling; the context manager closes each file once it is written.
    artefacts = {
        'x_train': train_features,
        'x_test': test_features,
        'y_train': y_train,
        'y_test': y_test,
    }
    for name, obj in artefacts.items():
        with open(f'../Pickles/{split_type}_{name}.p', 'wb') as fh:
            pickle.dump(obj, fh)

    print('Finished Pickling')

    return train_features, test_features, y_train, y_test



In [14]:
# The CSV has no header row, so pandas labels the columns by position.
df = pd.read_csv('../1.3mTweets.csv', encoding = 'latin', header= None)

# Keep column 0 as the label and column 5 as the tweet text. Label 4 is
# recoded to 1 so the labels are 0/1 (presumably 4 marks the positive
# class -- TODO confirm against the dataset description).
# A fresh frame is built instead of assigning into a column slice of the raw
# frame, which avoids pandas' SettingWithCopyWarning.
df = pd.DataFrame({'emotion': df[0].replace(4, 1), 'tweet': df[5]})

In [17]:
# Check the class balance of the recoded labels.
df.emotion.value_counts()

1    800000
0    800000
Name: emotion, dtype: int64

In [20]:
from tqdm import tqdm 
# Keep only the whitespace-separated words the en_US dictionary accepts.
en_us = enchant.Dict("en_US")

# Work on a copy so the frame is not modified while it is being iterated;
# the filtered text is available in `phrases` once the loop finishes.
phrases = df.tweet.to_numpy(copy = True)

# Iterating the tqdm object already advances the bar, so no manual
# pbar.update() call is needed (it would over-count the progress).
pbar = tqdm(enumerate(df.tweet), total = len(df.tweet))
for i, phrase in pbar:
    phrases[i] = ' '.join(w for w in phrase.split() if en_us.check(w))

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1600000/1600000 [59:52<00:00, 445.38it/s]


In [None]:
# Store the dictionary-filtered tweets in the frame.
df.tweet = phrases

tweet_token = TweetTokenizer()
word_tokenizer = RegexpTokenizer(r'\w+')

# Lowercase and tokenise with TweetTokenizer, re-join with spaces, then
# tokenise again on \w+ and re-join, all in a single pass per tweet.
df.tweet = df.tweet.map(
    lambda text: ' '.join(
        word_tokenizer.tokenize(
            ' '.join(tweet_token.tokenize(text.lower())).lower()
        )
    )
)

In [None]:
# Persist the cleaned frame (without the index column) to a CSV file.
df.to_csv('../Cleaned1.3m.csv', index = False)

In [None]:
# Inspect the cleaned frame.
df

In [None]:
# Stratified 85/15 split of the cleaned tweets and their labels.
# `df` (columns `tweet` / `emotion`) is the frame built in the cells above;
# `new_df` only exists inside clean_split and is not defined at this level.
x_train, x_test, y_train, y_test = train_test_split(df.tweet, df.emotion, stratify = df.emotion,
                                                    train_size = .85, random_state = 10)

#removing stop words
stop = stopwords.words('english')
vectorizer = CountVectorizer(stop_words = stop, max_features = 5000, ngram_range=(1,3))
clean_train = x_train.values
clean_test = x_test.values

# Fit the vocabulary on the training text only and reuse it for the test
# text, so both matrices share the same feature columns.
# NOTE(review): .toarray() builds a dense (rows x 5000) matrix; for the full
# dataset this may not fit in memory -- confirm before running.
train_features = vectorizer.fit_transform(clean_train).toarray()
test_features = vectorizer.transform(clean_test).toarray()

#pickling -- the context manager closes each file once it is written
test_artefacts = {
    '../Pickles/test_x_train.p': train_features,
    '../Pickles/test_x_test.p': test_features,
    '../Pickles/test_y_train.p': y_train,
    '../Pickles/test_y_test.p': y_test,
}
for pickle_path, obj in test_artefacts.items():
    with open(pickle_path, 'wb') as fh:
        pickle.dump(obj, fh)

In [None]:
from sklearn.linear_model import SGDClassifier, RidgeClassifier, LogisticRegression 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.svm import SVC,LinearSVC
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import classification_report, plot_confusion_matrix, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, StackingClassifier, BaggingClassifier, AdaBoostClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler
import os
import pickle
import pandas as pd
from sklearn.utils import resample

# Notebook display settings: show at most 35 rows of a frame before pandas
# truncates the output, and never truncate columns.
pd.options.display.max_rows = 35 
pd.options.display.max_columns = None

In [None]:
def get_pickles(split_type, pickle_dir='../Pickles'):
    """Load a pickled train/test split and print its class balance.

    Reads ``{pickle_dir}/{split_type}_x_train.p``, ``..._x_test.p``,
    ``..._y_train.p`` and ``..._y_test.p``.

    Parameters
    ----------
    split_type : str or int
        File-name prefix the split was saved under.
    pickle_dir : str, optional
        Directory holding the pickle files. Defaults to ``'../Pickles'``.

    Returns
    -------
    tuple
        ``(x_train, x_test, y_train, y_test)`` exactly as unpickled. The
        targets must provide ``value_counts()`` (e.g. pandas Series).
    """
    loaded = []
    for name in ('x_train', 'x_test', 'y_train', 'y_test'):
        # NOTE: unpickling can execute arbitrary code; only load files that
        # were produced by this project.
        with open(f'{pickle_dir}/{split_type}_{name}.p', 'rb') as fh:
            loaded.append(pickle.load(fh))
    x_train, x_test, y_train, y_test = loaded

    print('Train Value Counts')
    print(y_train.value_counts())
    print('~~~~~~~~~~~~~~~~~~~~~~~~~~')
    print('Test Value Counts')
    print(y_test.value_counts())
    print('~~~~~~~~~~~~~~~~~~~~~~~~~~')
    return x_train, x_test, y_train, y_test


# Load the split that was pickled under the 'test' prefix
# (../Pickles/test_x_train.p etc.). The prefix must be passed as a string.
x_train, x_test, y_train, y_test = get_pickles('test')
print(x_train.shape, x_test.shape) 
print(y_train.shape, y_test.shape)

### Resampling
- 0 = Negative
- 1 = Positive
- 2 = Neutral

In [None]:
# Baseline ("vanilla") classifiers, all with default hyper-parameters apart
# from the fixed seeds.
models = {'Log': LogisticRegression(), 'Knn': KNeighborsClassifier(), 
          'DT': DecisionTreeClassifier(random_state = 10), 'Gaussian': GaussianNB(), 'LDA': LinearDiscriminantAnalysis(),
          'LinearSVC': LinearSVC(max_iter = 1250, random_state = 10), 'SDGSVC': SGDClassifier(random_state = 10),  
          'ADA': AdaBoostClassifier(random_state = 10), 'Bagging': BaggingClassifier(random_state = 10), 
          'Ridge': RidgeClassifier(random_state = 10), 
          'RF': RandomForestClassifier(random_state = 10)}

#create stacked model from every base model, combined by a logistic regression
stack_m = list(models.items())
stack_model = StackingClassifier(estimators = stack_m, final_estimator = LogisticRegression(), cv = 5)
models['stacked'] = stack_model

# Seeded so the scores are reproducible and every model is scored on the
# same 5x5 folds.
cv = RepeatedStratifiedKFold(n_splits = 5, n_repeats = 5, random_state = 10)

#test each model and stacking
results = []
model_names = []
pbar = tqdm(models.items())
for model, m in pbar: 
    pbar.set_description(f'Evaluating {model.upper()}')
    # n_jobs = -1 uses all available cores instead of a machine-specific count.
    scores = cross_val_score(m, x_train, y_train, scoring = 'accuracy', cv = cv, n_jobs = -1, 
                             error_score = 'raise')
    results.append(scores)
    model_names.append(model)

In [None]:
# Map each model name to its array of cross-validation scores and persist it.
vanilla_dict = dict(zip(model_names, results))
# The context manager closes the file once the pickle has been written.
with open('models/VanillaResults.p', 'wb') as fh:
    pickle.dump(vanilla_dict, fh)

In [None]:
# One box per model showing the spread of its cross-validated accuracy;
# showmeans adds a marker for the mean score.
fig, ax = plt.subplots(figsize = (10,8))
ax.boxplot(results, labels = model_names, showmeans = True)
ax.set_title('Accuracy for Each Vanilla Model (Version 1)')
ax.set_ylabel('Accuracy')
ax.set_xlabel('Model')
fig.savefig('figures/BaselineAccuracy.png')
plt.show()

In [None]:
# Second round: a reduced set of baseline classifiers.
models = {'Log': LogisticRegression(), 
          'Gaussian': GaussianNB(), 
          'LinearSVC': LinearSVC(max_iter = 1250, random_state = 10), 'SDGSVC': SGDClassifier(random_state = 10),  
          'Ridge': RidgeClassifier(random_state = 10), 'RF': RandomForestClassifier(random_state = 10)}

#create stacked model from every base model, combined by a logistic regression
stack_m = list(models.items())
stack_model = StackingClassifier(estimators = stack_m, final_estimator = LogisticRegression(), cv = 5)
models['stacked'] = stack_model

# Seeded so the scores are reproducible and every model is scored on the
# same 5x5 folds.
cv = RepeatedStratifiedKFold(n_splits = 5, n_repeats = 5, random_state = 10)

#test each model and stacking
results = []
model_names = []
pbar = tqdm(models.items())
for model, m in pbar: 
    pbar.set_description(f'Evaluating {model.upper()}')
    scores = cross_val_score(m, x_train, y_train, scoring = 'accuracy', cv = cv, n_jobs = -1, 
                             error_score = 'raise')
    results.append(scores)
    model_names.append(model)

In [None]:
# Map each model name to its array of cross-validation scores and persist it.
vanilla_dict = dict(zip(model_names, results))
# The context manager closes the file once the pickle has been written.
with open('models/VanillaResults2.p', 'wb') as fh:
    pickle.dump(vanilla_dict, fh)

In [None]:
# One box per model showing the spread of its cross-validated accuracy;
# showmeans adds a marker for the mean score.
fig, ax = plt.subplots(figsize = (10,8))
ax.boxplot(results, labels = model_names, showmeans = True)
ax.set_title('Accuracy for Each Vanilla Model (Version 2)')
ax.set_ylabel('Accuracy')
ax.set_xlabel('Model')
fig.savefig('figures/BaselineAccuracy2.png')
plt.show()

In [None]:
# Third round: a further reduced model set, scored with more folds and repeats.
models = {'Log': LogisticRegression(), 
          'Gaussian': GaussianNB(), 
          'LinearSVC': LinearSVC(max_iter = 1250, random_state = 10), 'Ridge': RidgeClassifier(random_state = 10), 
          'RF': RandomForestClassifier(random_state = 10)}

#create stacked model from every base model, combined by a logistic regression
stack_m = list(models.items())
stack_model = StackingClassifier(estimators = stack_m, final_estimator = LogisticRegression(), cv = 5)
models['stacked'] = stack_model

# Seeded so the scores are reproducible and every model is scored on the
# same 7x10 folds.
cv = RepeatedStratifiedKFold(n_splits = 7, n_repeats = 10, random_state = 10)

#test each model and stacking
results = []
model_names = []
pbar = tqdm(models.items())
for model, m in pbar: 
    pbar.set_description(f'Evaluating {model.upper()}')
    scores = cross_val_score(m, x_train, y_train, scoring = 'accuracy', cv = cv, n_jobs = -1, 
                             error_score = 'raise')
    results.append(scores)
    model_names.append(model)

In [None]:
# Map each model name to its array of cross-validation scores and persist it.
vanilla_dict = dict(zip(model_names, results))
# The context manager closes the file once the pickle has been written.
with open('models/VanillaResults3.p', 'wb') as fh:
    pickle.dump(vanilla_dict, fh)

In [None]:
# One box per model showing the spread of its cross-validated accuracy;
# showmeans adds a marker for the mean score.
plt.figure(figsize = (10,8))
plt.boxplot(results, labels = model_names, showmeans = True)
plt.title('Accuracy for Each Vanilla Model (Version 3)')
plt.ylabel('Accuracy'); plt.xlabel('Model')
# Saved under its own name so the Version 2 figure is not overwritten.
plt.savefig('figures/BaselineAccuracy3.png')
plt.show()