# Importing all the required modules

In [88]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import string
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import pandas as pd

# Reading the data from CSV file

In [30]:
train_df = pd.read_csv('train_data.csv')
test_df = pd.read_csv('test_data.csv')

In [31]:
train_df.head(3)

Unnamed: 0,tweet_id,airline_sentiment,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,5.679e+17,negative,Southwest,,ColeyGirouard,,0,"@SouthwestAir I am scheduled for the morning, ...",,17-02-2015 20:16,Washington D.C.,Atlantic Time (Canada)
1,5.69989e+17,positive,Southwest,,WalterFaddoul,,0,@SouthwestAir seeing your workers time in and ...,,23-02-2015 14:36,"Indianapolis, Indiana; USA",Central Time (US & Canada)
2,5.68089e+17,positive,United,,LocalKyle,,0,@united Flew ORD to Miami and back and had gr...,,18-02-2015 08:46,Illinois,Central Time (US & Canada)


In [32]:
train_df.drop(['tweet_id', 'airline', 'name', 'negativereason_gold', 'retweet_count', 'tweet_coord', 'tweet_created', 'user_timezone'], axis = 1, inplace = True)
train_df.fillna('NaN', inplace = True)
train_df['Combined_text'] = train_df['airline_sentiment_gold'] + train_df['text'] + train_df['tweet_location']
train_df.drop(['airline_sentiment_gold', 'text', 'tweet_location'], axis = 1, inplace = True)
y = train_df.pop('airline_sentiment')
train_df.insert(1, 'airline_sentiment', y)
train_df.head()

Unnamed: 0,Combined_text,airline_sentiment
0,NaN@SouthwestAir I am scheduled for the mornin...,negative
1,NaN@SouthwestAir seeing your workers time in a...,positive
2,NaN@united Flew ORD to Miami and back and had...,positive
3,NaN@SouthwestAir @dultch97 that's horse radish...,negative
4,NaN@united so our flight into ORD was delayed ...,negative


In [33]:
test_df.drop(['tweet_id', 'airline', 'name', 'negativereason_gold', 'retweet_count', 'tweet_coord', 'tweet_created', 'user_timezone'], axis = 1, inplace = True)
test_df.fillna('NaN', inplace = True)
test_df['Combined_text'] = test_df['airline_sentiment_gold'] + test_df['text'] + test_df['tweet_location']
test_df.drop(['airline_sentiment_gold', 'text', 'tweet_location'], axis = 1, inplace = True)
test_df.head()

Unnamed: 0,Combined_text
0,NaN@AmericanAir In car gng to DFW. Pulled over...
1,"NaN@AmericanAir after all, the plane didn’t la..."
2,NaN@SouthwestAir can't believe how many paying...
3,NaN@USAirways I can legitimately say that I wo...
4,NaN@AmericanAir still no response from AA. gre...


In [34]:
train_data = train_df.values
train_data[0]

array(['NaN@SouthwestAir I am scheduled for the morning, 2 days after the fact, yes..not sure why my evening flight was the only one Cancelled FlightledWashington D.C.',
       'negative'], dtype=object)

In [52]:
test_data = test_df.values
test_data[0]

array(["NaN@AmericanAir In car gng to DFW. Pulled over 1hr ago - very icy roads. On-hold with AA since 1hr. Can't reach arpt for AA2450. Wat 2 do?Texas"],
      dtype=object)

# Cleaning the Data

In [36]:
sentiments = [sentiment for text, sentiment in train_data]
sentiments[:3]

['negative', 'positive', 'positive']

### Removing Stop Words

In [65]:
stop = stopwords.words('english')
punc = list(string.punctuation)
stop = stop + punc + ['nan'] + ['1', '2', '3', '4', '5', '6', '7', '8', '9', '0']

In [66]:
train_reviews = []
for review, sentiment in train_data:
    words = word_tokenize(review)
#     clean_words = [w for w in words if not w.lower() in stop]
#     reviews.append(clean_words)
    train_reviews.append(words)
train_reviews[0]

['NaN',
 '@',
 'SouthwestAir',
 'I',
 'am',
 'scheduled',
 'for',
 'the',
 'morning',
 ',',
 '2',
 'days',
 'after',
 'the',
 'fact',
 ',',
 'yes',
 '..',
 'not',
 'sure',
 'why',
 'my',
 'evening',
 'flight',
 'was',
 'the',
 'only',
 'one',
 'Cancelled',
 'FlightledWashington',
 'D.C',
 '.']

In [67]:
test_reviews = []
for review in test_data:
    words = word_tokenize(review[0])
    test_reviews.append(words)
test_reviews[0]

['NaN',
 '@',
 'AmericanAir',
 'In',
 'car',
 'gng',
 'to',
 'DFW',
 '.',
 'Pulled',
 'over',
 '1hr',
 'ago',
 '-',
 'very',
 'icy',
 'roads',
 '.',
 'On-hold',
 'with',
 'AA',
 'since',
 '1hr',
 '.',
 'Ca',
 "n't",
 'reach',
 'arpt',
 'for',
 'AA2450',
 '.',
 'Wat',
 '2',
 'do',
 '?',
 'Texas']

### Lemmatizing the reviews

In [68]:
def get_simple_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [69]:
def clean_review(words):
    output_words = []
    for w in words:
        if w.lower() not in stop:
            pos = pos_tag([w]) #Passing array to POS Tag
            clean_word = lemmatizer.lemmatize(w, pos = get_simple_pos(pos[0][1]))
            output_words.append(clean_word.lower())
    return output_words

In [70]:
lemmatizer = WordNetLemmatizer()
train_reviews = [clean_review(review) for review in train_reviews]
train_reviews[0]

['southwestair',
 'schedule',
 'morning',
 'day',
 'fact',
 'yes',
 '..',
 'sure',
 'even',
 'flight',
 'one',
 'cancelled',
 'flightledwashington',
 'd.c']

In [71]:
test_reviews = [clean_review(review) for review in test_reviews]
test_reviews[0]

['americanair',
 'car',
 'gng',
 'dfw',
 'pulled',
 '1hr',
 'ago',
 'icy',
 'road',
 'on-hold',
 'aa',
 'since',
 '1hr',
 'ca',
 "n't",
 'reach',
 'arpt',
 'aa2450',
 'wat',
 'texas']

# Using CountVectorizer

In [73]:
train_data = [" ".join(review) for review in train_reviews]
train_data[0]

'southwestair schedule morning day fact yes .. sure even flight one cancelled flightledwashington d.c'

In [74]:
test_data = [" ".join(review) for review in test_reviews]
test_data[0]

"americanair car gng dfw pulled 1hr ago icy road on-hold aa since 1hr ca n't reach arpt aa2450 wat texas"

In [175]:
count_vec = CountVectorizer(max_features = 3000, ngram_range = (1,2))
train_features = count_vec.fit_transform(train_data)
train_features.todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [176]:
count_vec.get_feature_names()

['00',
 '000',
 '000 mile',
 '03',
 '10',
 '10 day',
 '10 hour',
 '10 hr',
 '10 min',
 '10 minute',
 '100',
 '106',
 '10pm',
 '11',
 '12',
 '12 hour',
 '122',
 '13',
 '130',
 '14',
 '15',
 '15 min',
 '15 minute',
 '150',
 '16',
 '17',
 '18',
 '19',
 '1hr',
 '1k',
 '1st',
 '1st class',
 '20',
 '20 min',
 '20 minute',
 '200',
 '200 fee',
 '2015',
 '21',
 '22',
 '23',
 '24',
 '24 hour',
 '24 hr',
 '24hrs',
 '25',
 '25 min',
 '26',
 '27',
 '28',
 '2day',
 '2hrs',
 '2nd',
 '2nd time',
 '2x',
 '30',
 '30 min',
 '30 minute',
 '300',
 '30am',
 '32',
 '33',
 '35',
 '36',
 '37',
 '38',
 '39',
 '3hrs',
 '3rd',
 '40',
 '40 min',
 '40 minute',
 '400',
 '41',
 '42',
 '44',
 '45',
 '45 min',
 '45 minute',
 '47',
 '48',
 '4th',
 '50',
 '50 min',
 '50 minute',
 '500',
 '55',
 '59',
 '5hrs',
 '5th',
 '60',
 '600',
 '70',
 '700',
 '728',
 '73',
 '737',
 '75',
 '76',
 '77',
 '777',
 '7am',
 '80',
 '800',
 '800 number',
 '86',
 '872957',
 '90',
 '90 min',
 '96513',
 '96513 73',
 '99',
 'a320',
 'aa',
 'abl

In [177]:
test_features = count_vec.transform(test_data)

## Using SVC

In [178]:
svc = SVC()
svc.fit(train_features, sentiments)

SVC()

In [179]:
test_sentiments = svc.predict(test_features)

In [180]:
test_sentiments[:10]

array(['negative', 'negative', 'negative', 'negative', 'positive',
       'negative', 'neutral', 'negative', 'negative', 'negative'],
      dtype='<U8')

## Using Random Forest

In [103]:
rfc = RandomForestClassifier()
rfc.fit(train_features, sentiments)

RandomForestClassifier()

In [104]:
test_sentiments = rfc.predict(test_features)
test_sentiments[:10]

array(['negative', 'negative', 'negative', 'negative', 'negative',
       'negative', 'neutral', 'negative', 'negative', 'negative'],
      dtype='<U8')

In [181]:
op_df = pd.DataFrame(test_sentiments)
op_df.to_csv('predictions.csv', index = False, header = False)