In [1]:
import pandas as pd
import numpy as np
import re
# from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from string import punctuation

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [27]:
# print(train['negativereason_gold'].nunique())
# print(train['negativereason_gold'].value_counts(),"\n")

# print(x_test.negativereason_gold.nunique())
# print(x_test.negativereason_gold.value_counts())

## Cleaning

In [40]:
def hasNumbers(inputString):
    """Return True if any character of ``inputString`` is a digit, else False."""
    for ch in inputString:
        if ch.isdigit():
            # One digit is enough; no need to look at the rest.
            return True
    return False

def deEmojify(inputString):
    """Return ``inputString`` with every non-ASCII character (emoji included) removed."""
    # Characters that cannot be encoded as ASCII are silently skipped ...
    ascii_only = inputString.encode(encoding='ascii', errors='ignore')
    # ... so decoding the surviving bytes yields the ASCII-only text.
    return str(ascii_only, 'ascii')

In [79]:
# Columns that are not used as model inputs.
drop_cols = ['airline_sentiment_gold','name','tweet_id', 'retweet_count','tweet_created','user_timezone','tweet_coord','tweet_location']

# Read the raw CSVs and drop the unused columns in one step, so re-running this
# cell always rebuilds both frames from disk.
train = pd.read_csv("twitter_train.csv").drop(columns=drop_cols)  # 10980 rows 12 cols (before the drop)
test = pd.read_csv("twitter_test.csv").drop(columns=drop_cols)

# Stop list: NLTK English stop words + single punctuation characters + a few
# domain words that appear in almost every tweet.
stops = stopwords.words('english')
stops += list(punctuation)
# Entries must be lower-case: the cleaning step tests `word.lower()` against
# this list, so the previous upper-case 'AA' could never match.
# NOTE(review): this removes "aa" tokens that used to slip through, so the
# scores recorded further down may shift slightly.
stops += ['flight','airline','flights','aa']

In [80]:
# Shorthand seen in tweets, mapped to the full word (keys are lower-case).
abbreviations = {'ppl': 'people','cust':'customer','serv':'service','mins':'minutes','hrs':'hours','svc': 'service',
           'u':'you','pls':'please'}

# Row labels of the tweets that carry a gold negative-reason annotation.
# The cleaning below no longer needs them; kept so any other cell that refers
# to these names keeps working.
train_index = train[~train.negativereason_gold.isna()].index
test_index = test[~test.negativereason_gold.isna()].index

# Raw-string patterns: the same expressions as before, without relying on
# Python passing unknown escapes such as "\s" through a normal string literal.
LINK_RE = re.compile(r'((www\.[^\s]+)|(https?://[^\s]+))')
MENTION_RE = re.compile(r'@[^\s]+')
WHITESPACE_RE = re.compile(r'[\s]+')
HASHTAG_RE = re.compile(r'#([^\s]+)')


def clean_tweet(text, airline, negative_reason=None, stop_words=None):
    """Normalise one tweet and append its airline (and gold reason, if any).

    Steps: strip non-ASCII characters, remove links and @usernames, collapse
    whitespace, turn ``#word`` into ``word``, trim surrounding quotes, drop
    tokens that contain a digit or are stop words, expand known abbreviations
    and lower-case everything else.

    Parameters
    ----------
    text : str
        Raw tweet text.
    airline : str
        Airline name; appended to the cleaned text as an extra token.
    negative_reason : str or missing value, optional
        Gold negative-reason label; appended only when it is not missing.
    stop_words : collection of str, optional
        Lower-case tokens to drop. Defaults to the notebook-level ``stops``.

    Returns
    -------
    str
        The cleaned text, in the same " <tweet> <airline>[ <reason>]" layout
        the notebook used before.
    """
    if stop_words is None:
        stop_words = set(stops)

    # Strip emoji first. Previously the stripped text was stored on the row
    # and then overwritten, so it never reached the cleaned tweet.
    tweet = deEmojify(text)
    tweet = LINK_RE.sub('', tweet)         # remove links
    tweet = MENTION_RE.sub('', tweet)      # remove usernames
    tweet = WHITESPACE_RE.sub(' ', tweet)  # remove additional whitespaces
    tweet = HASHTAG_RE.sub(r'\1', tweet)   # replace #word with word
    tweet = tweet.strip('\'"')             # trim tweet

    words = []
    for word in tweet.split():
        lowered = word.lower()
        if hasNumbers(word) or lowered in stop_words:
            continue
        # Look abbreviations up on the lower-cased token so "Pls" and "U" are
        # expanded as well; anything unknown is kept in lower case.
        words.append(abbreviations.get(lowered, lowered))

    tweet = " %s %s" % (" ".join(words), airline)
    if pd.notna(negative_reason):
        tweet = " %s %s" % (tweet, negative_reason)
    return tweet


def clean_frame(frame):
    """Replace ``frame['text']`` with cleaned tweets and drop ``negativereason_gold``.

    The new column is assigned in one step. Writing to the row objects yielded
    by ``iterrows()`` is not guaranteed to reach the frame, because those rows
    may be copies.
    """
    stop_words = set(stops)  # built once per frame; membership tests are O(1)
    frame['text'] = [
        clean_tweet(text, airline, reason, stop_words)
        for text, airline, reason in zip(
            frame['text'], frame['airline'], frame['negativereason_gold']
        )
    ]
    del frame['negativereason_gold']


clean_frame(train)
clean_frame(test)

In [43]:
# Show the first rows of the training frame as it stands after the cleaning cell.
train.head()
# test.head()

Unnamed: 0,airline_sentiment,airline,text
0,negative,Southwest,"scheduled morning, days fact, yes..not sure e..."
1,positive,Southwest,seeing workers time time going beyond love fl...
2,positive,United,"flew ord miami back great crew, service legs...."
3,negative,Southwest,that's horse radish Southwest
4,negative,United,"ord delayed air force one, last sbn minutes l..."


## Creating vocab and formatting data

In [81]:
# Word-level unigram TF-IDF; the vocabulary is learned from the training text
# only and then reused unchanged for the test text.
v = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 1),
    max_df=0.8,
    max_features=3150,
)
train_features = v.fit_transform(train['text'])
test_features = v.transform(test['text'])

In [82]:
# Logistic regression on the TF-IDF features.
# Author's note: Best Performance. SCORED: 0.7929
clf = LogisticRegression(C=2.1, solver='liblinear', multi_class='auto')
clf = clf.fit(train_features, train['airline_sentiment'])  # fit() returns the estimator itself
pred = clf.predict(test_features)

# Write one predicted label per line.
with open('logreg.csv', 'w') as out_file:
    out_file.writelines("%s\n" % label for label in pred)

In [66]:
# Linear-kernel SVC on the same TF-IDF features. Author's note: SCORED .78
# Alternative configuration left by the author: SVC(C = 1000, gamma = 0.001)
clf = SVC(kernel="linear", C=0.96, gamma='scale').fit(
    train_features, train['airline_sentiment']
)
pred = clf.predict(test_features)

# Write one predicted label per line (author's note: less accurate).
with open('svc.csv', 'w') as out_file:
    out_file.writelines("%s\n" % label for label in pred)

In [60]:
# v.get_feature_names()

In [67]:
from sklearn.naive_bayes import MultinomialNB  # author's note: score 0.7374

# Multinomial naive Bayes with default settings on the TF-IDF features.
clf = MultinomialNB().fit(train_features, train['airline_sentiment'])
y_pred = clf.predict(test_features)

# Write one predicted label per line.
with open('nb.csv', 'w') as out_file:
    out_file.writelines("%s\n" % label for label in y_pred)