# Twitter Sentiment Analysis 

## Importing Required Libraries and modules 

In [1]:
import numpy as np 
import pandas as pd  
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk import pos_tag
from nltk.corpus import stopwords
import string
from nltk.tokenize import word_tokenize 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

## Loading the data 

In [2]:
# Loading training dataset
training= pd.read_csv('training.csv',delimiter=',')

# Loading testing dataset
testing= pd.read_csv('testing.csv',delimiter=',')

In [3]:
training.head()

Unnamed: 0,tweet_id,airline_sentiment,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,567900433542488064,negative,Southwest,,ColeyGirouard,,0,"@SouthwestAir I am scheduled for the morning, ...",,2015-02-17 20:16:29 -0800,Washington D.C.,Atlantic Time (Canada)
1,569989168903819264,positive,Southwest,,WalterFaddoul,,0,@SouthwestAir seeing your workers time in and ...,,2015-02-23 14:36:22 -0800,"Indianapolis, Indiana; USA",Central Time (US & Canada)
2,568089179520954368,positive,United,,LocalKyle,,0,@united Flew ORD to Miami and back and had gr...,,2015-02-18 08:46:29 -0800,Illinois,Central Time (US & Canada)
3,568928195581513728,negative,Southwest,,amccarthy19,,0,@SouthwestAir @dultch97 that's horse radish 😤🐴,,2015-02-20 16:20:26 -0800,,Atlantic Time (Canada)
4,568594180014014464,negative,United,,J_Okayy,,0,@united so our flight into ORD was delayed bec...,,2015-02-19 18:13:11 -0800,,Eastern Time (US & Canada)


In [4]:
testing.head()

Unnamed: 0,tweet_id,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,569682010270101504,American,,zsalim03,,0,@AmericanAir In car gng to DFW. Pulled over 1h...,,2015-02-22 18:15:50 -0800,Texas,Central Time (US & Canada)
1,569608307184242688,American,,sa_craig,,0,"@AmericanAir after all, the plane didn’t land ...",,2015-02-22 13:22:57 -0800,"College Station, TX",Central Time (US & Canada)
2,567879304593408001,Southwest,,DanaChristos,,1,@SouthwestAir can't believe how many paying cu...,,2015-02-17 18:52:31 -0800,CT,Eastern Time (US & Canada)
3,569757651539660801,US Airways,,rossj987,,0,@USAirways I can legitimately say that I would...,,2015-02-22 23:16:24 -0800,"Washington, D.C.",Eastern Time (US & Canada)
4,569900705852608513,American,,tranpham18,,0,@AmericanAir still no response from AA. great ...,,2015-02-23 08:44:51 -0800,New York City,Eastern Time (US & Canada)


In [5]:
x_train=training['text']
x_train=np.array(x_train)
x_train

array(['@SouthwestAir I am scheduled for the morning, 2 days after the fact, yes..not sure why my evening flight was the only one Cancelled Flightled',
       '@SouthwestAir seeing your workers time in and time out going above and beyond is why I love flying with you guys. Thank you!',
       '@united Flew ORD to Miami and back and  had great crew, service on both legs. THANKS',
       ..., '@usairways the. Worst. Ever. #dca #customerservice',
       '@nrhodes85: look! Another apology. DO NOT FLY @USAirways',
       '@united you are by far the worst airline. 4 plane delays on 1 round trip flight. How is that possible.'],
      dtype=object)

In [6]:
y_train=training['airline_sentiment']
y_train=np.array(y_train)
y_train

array(['negative', 'positive', 'positive', ..., 'negative', 'negative',
       'negative'], dtype=object)

In [7]:
x_train.shape,y_train.shape

((10980,), (10980,))

In [8]:
x_test=testing['text'].values
x_test=np.array(x_test)
x_test

array(["@AmericanAir In car gng to DFW. Pulled over 1hr ago - very icy roads. On-hold with AA since 1hr. Can't reach arpt for AA2450. Wat 2 do?",
       '@AmericanAir after all, the plane didn’t land in identical or worse) conditions at GRK according to METARs.',
       "@SouthwestAir can't believe how many paying customers you left high and dry with no reason for flight Cancelled Flightlations Monday out of BDL! Wow.",
       ...,
       'Nice RT @VirginAmerica: The man of steel might be faster, but we have WiFi – just saying. #ScienceBehindTheExperience http://t.co/FGRbpAZSiX',
       '@AmericanAir Aww Thanks AA..DFW was on GMA up here this AM..so i understand ..Btw A.A is my Airline when im able to trv..Love you guys.:)',
       '@united the lounge tells us they have no pillows for my grandma as one of the ladies opens the closet and I see 2 right there. #unitedlies'],
      dtype=object)

In [9]:
x_test.shape

(3660,)

In [10]:
# Getting the lemmatizer 
lemmatizer=WordNetLemmatizer()

In [11]:
# Making a function to simplify the POS tag which is required for lemmatizer 


def Get_Simple_POS(tag):
    
    if tag.startswith('J'):
        return wordnet.ADJ
    
    if tag.startswith('V'):
        return wordnet.VERB
    
    if tag.startswith('N'):
        return wordnet.NOUN
    
    if tag.startswith('R'):
        return wordnet.ADJ
    
    else:
        return wordnet.NOUN

## Getting the stopwords and adding punctuations to the list

In [12]:
stops=set(stopwords.words('english'))
punctuations=list(string.punctuation)
stops.update(punctuations)

In [13]:
# Number of stopwords 
len(stops)

211

In [14]:
def Clean_Reviews(words):
    
    output_words=[]
    
    for word in words:
        if word.lower() not in stops:
            pos=pos_tag([word])
            clean_word= lemmatizer.lemmatize(
                word,pos= Get_Simple_POS(pos[0][1]))
                
            output_words.append(clean_word.lower())
            
    return " ".join(output_words)

In [15]:
x_train=[Clean_Reviews(word_tokenize(i)) for i in x_train]

In [16]:
x_train[10]

"southwestair mco- gt dca flight almost full people screw msy-dca cancelled flightation united usairways n't cancelled flight swa=mistake"

In [17]:
x_test = [Clean_Reviews(word_tokenize(i)) for i in x_test]

In [18]:
count_vector = CountVectorizer(max_features = 10025 , ngram_range=(1,3)) 

In [19]:
count_vector

In [20]:
x_train_features = count_vector.fit_transform(x_train) 

In [21]:
X_train = x_train_features.todense()

In [22]:
x_test_features = count_vector.transform(x_test)

In [23]:
X_test = x_test_features.todense()

In [24]:
# Importing classifiers 

from sklearn.naive_bayes import MultinomialNB 
from sklearn.ensemble import RandomForestClassifier

In [25]:
# Using Naive Bayes 

# Training our classifier 
clf_1 = MultinomialNB() 
clf_1.fit(x_train_features , y_train)

# Making predictions 
Y_pred = clf_1.predict(x_test_features)

In [26]:
# Making a Dataframe of Predictions and storing them in .csv format
Y_pred=pd.DataFrame(Y_pred)
Y_pred.to_csv('prediction.csv',sep=',', index=False,header=False)

In [28]:
clf_1.score(x_test_features,Y_pred)

1.0

In [30]:
# Using Random Fr 

# Training our classifier 

clf_2=RandomForestClassifier(n_estimators=2000, n_jobs=-1)
clf_2.fit(x_train_features, y_train)
y_test=clf_2.predict(x_test_features)

In [31]:
clf_2.score(x_test_features,y_test)

1.0