In [1]:
import pandas as pd
import re
import tweepy
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Path to the labelled airline-tweet CSV, relative to this notebook.
airlines_csv = '../Resources/Airline-Sentiment-2-w-AA.csv'

# The file is decoded as ISO-8859-1 (presumably it is not valid UTF-8 -- verify
# against the source export).
airlines_df = pd.read_csv(
    filepath_or_buffer=airlines_csv,
    encoding='ISO-8859-1',
)

# Preview the first five rows.
airlines_df.head(n=5)

Unnamed: 0,_unit_id,_golden,_unit_state,_trusted_judgments,_last_judgment_at,airline_sentiment,airline_sentiment:confidence,negativereason,negativereason:confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_id,tweet_location,user_timezone
0,681448150,False,finalized,3,2/25/15 5:24,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2/24/15 11:35,5.70306e+17,,Eastern Time (US & Canada)
1,681448153,False,finalized,3,2/25/15 1:53,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2/24/15 11:15,5.70301e+17,,Pacific Time (US & Canada)
2,681448156,False,finalized,3,2/25/15 10:01,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2/24/15 11:15,5.70301e+17,Lets Play,Central Time (US & Canada)
3,681448158,False,finalized,3,2/25/15 3:05,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2/24/15 11:15,5.70301e+17,,Pacific Time (US & Canada)
4,681448159,False,finalized,3,2/25/15 5:50,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2/24/15 11:14,5.70301e+17,,Pacific Time (US & Canada)


In [3]:
# Pick the model inputs and targets by column name instead of by position, so
# that a reordered or extended CSV cannot silently feed the wrong column in.
# 'text' holds the raw tweet and 'airline_sentiment' the
# positive/negative/neutral label (see the preview of airlines_df).
features = airlines_df['text'].values
labels = airlines_df['airline_sentiment'].values
print(labels)

['neutral' 'positive' 'neutral' ... 'neutral' 'negative' 'neutral']


In [4]:
# Clean the tweets: drop hashtags, @mentions, links and retweet markers, then
# lowercase whatever words remain.
# The prefix test runs on the original (case-sensitive) word, so only an
# upper-case 'RT' prefix is treated as a retweet marker.
SKIPPED_PREFIXES = ('#', '@', 'http', 'RT')

clean_data = []
for feature in features:
    item = ' '.join(
        word.lower()
        for word in feature.split()
        if not word.startswith(SKIPPED_PREFIXES)
    )
    # Keep the entry even when every word was filtered out. Skipping it would
    # make clean_data shorter than `labels`, so each later tweet would be
    # paired with the wrong sentiment (or the fit would fail on mismatched
    # lengths). An empty string simply becomes an all-zero feature vector.
    clean_data.append(item)

# One cleaned tweet per input tweet, in the same order as `labels`.
assert len(clean_data) == len(features)
        

In [5]:
# https://www.earthdatascience.org/courses/earth-analytics-python/using-apis-natural-language-processing-twitter/calculate-tweet-word-frequencies-in-python/
# Adapted from the link above: removes URLs and all special characters.
# Written as a raw string so that \w, \/ and \S reach the regex engine without
# triggering Python's invalid-escape-sequence warning; compiled once for reuse.
_URL_OR_SPECIAL_CHARS = re.compile(r"([^0-9A-Za-z \t])|(\w+:\/\/\S+)")


def remove_url(txt):
    """Strip URLs and special characters from ``txt`` and normalise whitespace.

    Despite the name, this removes more than URLs: every character that is
    not an ASCII letter, digit, space or tab is deleted, as is any
    ``scheme://...`` run of non-whitespace. The remaining words are re-joined
    with single spaces.

    Parameters
    ----------
    txt : str
        Text to clean.

    Returns
    -------
    str
        The cleaned text; an empty string if nothing is left.
    """
    return " ".join(_URL_OR_SPECIAL_CHARS.sub("", txt).split())

# Run every cleaned tweet through remove_url, keeping the original order.
cleaned_data_no_urls = list(map(remove_url, clean_data))

# Show the first ten results as a sanity check.
cleaned_data_no_urls[:10]

['what said',
 'plus youve added commercials to the experience tacky',
 'i didnt today must mean i need to take another trip',
 'its really aggressive to blast obnoxious entertainment in your guests faces amp they have little recourse',
 'and its a really big bad thing about it',
 'seriously would pay 30 a flight for seats that didnt have this playing its really the only bad thing about flying va',
 'yes nearly every time i fly vx this ear worm wont go away',
 'really missed a prime opportunity for men without hats parody there',
 'well i didntbut now i do d',
 'it was amazing and arrived an hour early youre too good to me']

In [6]:
# Turn the sentiment strings into integer class ids.
# LabelEncoder keeps its classes sorted, so the mapping is
# negative -> 0, neutral -> 1, positive -> 2.
from sklearn import preprocessing

sentiments = ['positive', 'negative', 'neutral']
le = preprocessing.LabelEncoder().fit(sentiments)
train_labels = le.transform(labels)

Neural Network (MLPClassifier)

In [7]:
# Create the text-classification pipeline:
# token counts -> TF-IDF weighting -> multi-layer perceptron (neural network).
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# MLP training is stochastic (random weight initialisation and shuffling), so
# the seed is fixed to make scores comparable between runs.
RANDOM_STATE = 42

text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MLPClassifier(random_state=RANDOM_STATE)),
])

In [8]:
#set parameters to test
from sklearn.model_selection import GridSearchCV
# Hyper-parameter grid for the search. Keys follow the
# '<pipeline step>__<parameter>' convention.
parameters = dict(
    # n-gram sizes used for tokenization: unigrams, up to bigrams, up to trigrams
    vect__ngram_range=[(1, 1), (1, 2), (1, 3)],
    # ignore terms whose document frequency exceeds this fraction
    # (used in place of a stop-word list)
    vect__max_df=[0.25, 0.5, 0.75, 1.0],
    # with or without inverse-document-frequency reweighting
    tfidf__use_idf=[True, False],
    # a single hidden layer of the given width
    clf__hidden_layer_sizes=[(100,), (200,), (300,)],
    # hidden-layer activation function
    clf__activation=['identity', 'logistic', 'tanh', 'relu'],
    # weight-optimization algorithm
    clf__solver=['lbfgs', 'sgd', 'adam'],
    # regularization strength of the MLP
    clf__alpha=[0.0001, 0.001, 0.00001],
)

In [9]:
# Create the grid-search model: exhaustive search over `parameters` with
# 5-fold cross-validation, using all CPU cores (n_jobs=-1) and progress
# logging (verbose=1).
# `iid` is deliberately not passed: the argument was deprecated in
# scikit-learn 0.22 and removed in 0.24, where supplying it raises TypeError.
gs_clf = GridSearchCV(text_clf, parameters, cv=5, n_jobs=-1, verbose=1)

In [None]:
# Run the full grid search on the cleaned tweets and their encoded labels.
# This is the expensive step: every candidate is fitted once per CV fold.
predictor = gs_clf.fit(X=cleaned_data_no_urls, y=train_labels)

Fitting 5 folds for each of 2592 candidates, totalling 12960 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


In [None]:
# Report the best mean cross-validated score found by the search ...
print('Best Score: {}'.format(gs_clf.best_score_))

# ... followed by the winning value of each searched parameter, in
# alphabetical order of parameter name.
for param_name in sorted(parameters):
    best_value = gs_clf.best_params_[param_name]
    print((param_name, best_value))

In [None]:
# Persist the fitted grid search so it can be reloaded without re-training.
import os
from joblib import dump, load

model_path = 'Models/NeuralMLPClassifier.joblib'
# Make sure the target folder exists first: dump() does not create missing
# directories, and failing here would throw away the result of the search.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
dump(predictor, model_path)