In [32]:
# import dependencies
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re
from sklearn import tree

In [12]:
# Load the airline tweets CSV into a DataFrame (decoded as ISO-8859-1)
# and preview the first rows.
airlines_csv = '../Resources/Airline-Sentiment-2-w-AA.csv'
airlines_df = pd.read_csv(
    airlines_csv,
    encoding='ISO-8859-1',
)
airlines_df.head()

Unnamed: 0,_unit_id,_golden,_unit_state,_trusted_judgments,_last_judgment_at,airline_sentiment,airline_sentiment:confidence,negativereason,negativereason:confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_id,tweet_location,user_timezone
0,681448150,False,finalized,3,2/25/2015 5:24,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2/24/2015 11:35,5.7e+17,,Eastern Time (US & Canada)
1,681448153,False,finalized,3,2/25/2015 1:53,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2/24/2015 11:15,5.7e+17,,Pacific Time (US & Canada)
2,681448156,False,finalized,3,2/25/2015 10:01,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2/24/2015 11:15,5.7e+17,Lets Play,Central Time (US & Canada)
3,681448158,False,finalized,3,2/25/2015 3:05,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2/24/2015 11:15,5.7e+17,,Pacific Time (US & Canada)
4,681448159,False,finalized,3,2/25/2015 5:50,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2/24/2015 11:14,5.7e+17,,Pacific Time (US & Canada)


## Cleaning the data
 * Remove usernames/special characters, spaces, numbers, and NaN rows
 * Found great code to use here: https://github.com/mertkahyaoglu/twitter-sentiment-analysis/blob/master/utils.py

In [13]:
# Pull the tweet text (features) and the sentiment label (labels).
# Columns are selected by name rather than by position so the selection
# still picks the right data if columns in the CSV are added or reordered.
features = airlines_df['text'].values
print(features)
labels = airlines_df['airline_sentiment'].values
print(labels)

['@VirginAmerica What @dhepburn said.'
 "@VirginAmerica plus you've added commercials to the experience... tacky."
 "@VirginAmerica I didn't today... Must mean I need to take another trip!"
 ... '@AmericanAir Please bring American Airlines to #BlackBerry10'
 "@AmericanAir you have my money, you change my flight, and don't answer your phones! Any other suggestions so I can make my commitment??"
 '@AmericanAir we have 8 ppl so we need 2 know how many seats are on the next flight. Plz put us on standby for 4 people on the next flight?']
['neutral' 'positive' 'neutral' ... 'neutral' 'negative' 'neutral']


In [14]:
# Lowercase every tweet and drop hashtags, mentions, links and retweet markers.
#
# Exactly one cleaned string is produced per input tweet (a tweet that loses
# all of its words becomes ''), so clean_data stays index-aligned with
# `labels`. Skipping emptied tweets here would leave fewer tweets than labels
# and break the train/test split and model fit further down.
DROPPED_PREFIXES = ('#', '@', 'http', 'RT')

clean_data = [
    ' '.join(
        word.lower()
        for word in feature.split()
        # Prefixes are checked on the original-case word (case-sensitive);
        # str.startswith accepts a tuple and matches any of its entries.
        if not word.startswith(DROPPED_PREFIXES)
    )
    for feature in features
]

In [23]:
# https://www.earthdatascience.org/courses/earth-analytics-python/using-apis-natural-language-processing-twitter/calculate-tweet-word-frequencies-in-python/
# used code from link above to remove all special characters 
# Matches either a single character that is not an ASCII letter, digit, space
# or tab, or a whole "scheme://..." URL. Written as a raw string so the regex
# escapes (\w, \S) are not parsed as invalid Python string escapes.
_SPECIAL_CHAR_OR_URL = re.compile(r"([^0-9A-Za-z \t])|(\w+:\/\/\S+)")


def remove_url(txt):
    """Strip URLs and special characters from a tweet and tidy its whitespace.

    Despite the name, this removes more than URLs: every character other than
    ASCII letters, digits, spaces and tabs is deleted as well.

    Parameters
    ----------
    txt : str
        The text to clean.

    Returns
    -------
    str
        The cleaned text with runs of whitespace collapsed to single spaces
        and no leading or trailing whitespace.
    """
    return " ".join(_SPECIAL_CHAR_OR_URL.sub("", txt).split())

# Run every pre-cleaned tweet through remove_url, then preview the first ten.
cleaned_data_no_urls = list(map(remove_url, clean_data))
cleaned_data_no_urls[:10]

['what said',
 'plus youve added commercials to the experience tacky',
 'i didnt today must mean i need to take another trip',
 'its really aggressive to blast obnoxious entertainment in your guests faces amp they have little recourse',
 'and its a really big bad thing about it',
 'seriously would pay 30 a flight for seats that didnt have this playing its really the only bad thing about flying va',
 'yes nearly every time i fly vx this ear worm wont go away',
 'really missed a prime opportunity for men without hats parody there',
 'well i didntbut now i do d',
 'it was amazing and arrived an hour early youre too good to me']

# Train Test Split

In [34]:
# Partition the cleaned tweets and their labels into training and testing
# subsets; the fixed random_state makes the partition repeatable.
# NOTE(review): the grid searches below are fitted on cleaned_data_no_urls /
# train_labels (all rows), not on X_train / y_train - confirm that is intended.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    cleaned_data_no_urls,
    labels,
    random_state=42,
)

In [40]:
# Convert the sentiment strings into integer class codes.
from sklearn import preprocessing

sentiments = ['positive', 'negative', 'neutral']
le = preprocessing.LabelEncoder().fit(sentiments)

# Despite the name, this encodes every entry of `labels`, not only y_train.
train_labels = le.transform(labels)

In [41]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import tree

# Text classification pipeline: token counts -> TF-IDF weighting -> decision tree.
# random_state pins the tree's internal randomness (feature shuffling, and the
# 'random' splitter when it is selected) so that reruns give the same scores
# and the same fitted model. 42 matches the seed used for train_test_split.
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', tree.DecisionTreeClassifier(random_state=42)),
])

In [42]:
# Set the hyper-parameters to test
from sklearn.model_selection import GridSearchCV
parameters = {
    # number of combined words for tokenization
    'vect__ngram_range': [(1, 1), (1, 2), (1, 3)],
    # remove words above a specified threshold (used in place of stop words)
    'vect__max_df': (0.25, 0.5, 0.75, 1.0),
    # include idf
    'tfidf__use_idf': (True, False)
}

# 5-fold cross-validated search over every parameter combination.
# `iid` is intentionally not passed: the argument was deprecated in
# scikit-learn 0.22 and removed in 0.24 (passing it raises TypeError). Since
# then the reported score is always the plain mean over folds, which is what
# iid=False requested.
gs_clf = GridSearchCV(text_clf, parameters, cv=5)

In [43]:
# Run the grid search on all cleaned tweets and their encoded labels.
# fit() returns the fitted search object, which is kept as `predictor`.
predictor = gs_clf.fit(
    cleaned_data_no_urls,
    train_labels,
)

In [45]:
# Report the best cross-validated score, then the winning value of each
# searched parameter in alphabetical order of parameter name.
print(f'Best Score: {gs_clf.best_score_}')

best_params = gs_clf.best_params_
for param_name in sorted(parameters):
    print((param_name, best_params[param_name]))

Best Score: 0.6897549969016394
('tfidf__use_idf', False)
('vect__max_df', 0.25)
('vect__ngram_range', (1, 1))


# Add more parameters

In [47]:
# Set the hyper-parameters to test (vectorizer/TF-IDF options plus tree options)
from sklearn.model_selection import GridSearchCV
parameters = {
    # number of combined words for tokenization
    'vect__ngram_range': [(1, 1), (1, 2), (1, 3)],
    # remove words above a specified threshold (used in place of stop words)
    'vect__max_df': (0.25, 0.5, 0.75, 1.0),
    # include idf
    'tfidf__use_idf': (True, False),
    # impurity measure used to evaluate candidate splits
    'clf__criterion': ('gini', 'entropy'),
    # strategy used to choose the split at each node
    'clf__splitter': ('best', 'random')
}

# 5-fold cross-validated search over every parameter combination.
# `iid` is intentionally not passed: the argument was deprecated in
# scikit-learn 0.22 and removed in 0.24 (passing it raises TypeError). Since
# then the reported score is always the plain mean over folds, which is what
# iid=False requested.
gs_clf = GridSearchCV(text_clf, parameters, cv=5)

In [48]:
# Run the extended grid search on all cleaned tweets and their encoded labels.
# fit() returns the fitted search object, which is kept as `predictor`.
predictor = gs_clf.fit(
    cleaned_data_no_urls,
    train_labels,
)

In [49]:
# Report the best cross-validated score, then the winning value of each
# searched parameter in alphabetical order of parameter name.
print(f'Best Score: {gs_clf.best_score_}')

best_params = gs_clf.best_params_
for param_name in sorted(parameters):
    print((param_name, best_params[param_name]))

Best Score: 0.7010253034628284
('clf__criterion', 'gini')
('clf__splitter', 'random')
('tfidf__use_idf', False)
('vect__max_df', 0.25)
('vect__ngram_range', (1, 2))


In [52]:
import os
from joblib import dump, load

# Persist the fitted grid-search object (`predictor`) to disk.
MODEL_PATH = 'Models/DecisionTree.joblib'
# Make sure the target folder exists first so that saving does not fail
# on a fresh checkout where Models/ has not been created yet.
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)
dump(predictor, MODEL_PATH)

['Models/DecisionTree.joblib']