In [13]:
!pip install contractions

import re, string, unicodedata                          # Import Regex, string and unicodedata.
import contractions                                     # Import contractions library.
from bs4 import BeautifulSoup                           # Import BeautifulSoup.

import numpy as np                                      # Import numpy.
import pandas as pd                                     # Import pandas.
import nltk                                             # Import Natural Language Tool-Kit.

nltk.download('stopwords')                              # Download Stopwords.
nltk.download('punkt')
nltk.download('wordnet')

from nltk.corpus import stopwords                       # Import stopwords.
from nltk.tokenize import word_tokenize, sent_tokenize  # Import Tokenizer.
from nltk.stem.wordnet import WordNetLemmatizer         # Import Lemmatizer.

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [14]:
from google.colab import drive
# drive.mount('/gdrive')
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [15]:
alldata = pd.read_csv('/content/drive/My Drive/Tweets.csv')

In [16]:
alldata.columns

Index(['tweet_id', 'airline_sentiment', 'airline_sentiment_confidence',
       'negativereason', 'negativereason_confidence', 'airline',
       'airline_sentiment_gold', 'name', 'negativereason_gold',
       'retweet_count', 'text', 'tweet_coord', 'tweet_created',
       'tweet_location', 'user_timezone'],
      dtype='object')

In [17]:
data = alldata[['airline_sentiment','text']]

In [18]:
data.shape

(14640, 2)

In [19]:
data.head(5)

Unnamed: 0,airline_sentiment,text
0,neutral,@VirginAmerica What @dhepburn said.
1,positive,@VirginAmerica plus you've added commercials t...
2,neutral,@VirginAmerica I didn't today... Must mean I n...
3,negative,@VirginAmerica it's really aggressive to blast...
4,negative,@VirginAmerica and it's a really big bad thing...


In [20]:
data.isnull().sum()

airline_sentiment    0
text                 0
dtype: int64

In [21]:
# Making a list of stop words
stopwords = stopwords.words('english')

customlist = ['not', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn',
        "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn',
        "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn',
        "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]

stopwords = list(set(stopwords) - set(customlist))      

In [22]:
# Data preprocessing
# -----------------------------------
lemmatizer = WordNetLemmatizer()

def tweetes_preprocessing(words):

# HTML Tag removal
  soup = BeautifulSoup(words, "html.parser")
  no_html = soup.get_text()

# Remove contractions
  fixed_contractions = contractions.fix(no_html)

# Remove numbers and keep only text
  letters_only = re.sub("[^a-zA-Z]", " ", fixed_contractions)

# Convert to small letters
  words = letters_only.lower()

# Tokenize words 
  words = nltk.word_tokenize(words)

#  Remove non-ASCII characters from list of tokenized words
  non_ASCII = []
  for word in words:
    new_word = unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    non_ASCII.append(new_word)

# Remove punctuation from list of tokenized words
  no_punctuation = []
  for word in non_ASCII:
    new_word = re.sub(r'[.|,|)|(|\|/]',r' ', word)
    if new_word != '':
        no_punctuation.append(new_word)

# Remove stop words from list of tokenized words
  no_stopwords = []
  for word in no_punctuation:
    if word not in stopwords:
        no_stopwords.append(word)

# Lemmetize words
  lemmatized_words = []
  for word in no_stopwords:
    lemmatized_words.append(lemmatizer.lemmatize(word, pos='v'))
  
  return  (" ".join(lemmatized_words))

# -----------------------------------
data['text'] = data['text'].apply(lambda x: tweetes_preprocessing(x))
data.head(5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,airline_sentiment,text
0,neutral,virginamerica dhepburn say
1,positive,virginamerica plus add commercials experience ...
2,neutral,virginamerica not today must mean need take an...
3,negative,virginamerica really aggressive blast obnoxiou...
4,negative,virginamerica really big bad thing


In [23]:
print (data['airline_sentiment'].unique())
data['airline_sentiment']= data['airline_sentiment'].replace({'neutral':1, 'positive':2, 'negative':0})
print (data['airline_sentiment'].unique())

['neutral' 'positive' 'negative']
[1 2 0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [24]:
labels = data['airline_sentiment']
labels = labels.astype('int')
data_features = data['text']

In [25]:
from sklearn.feature_extraction.text import CountVectorizer
v = CountVectorizer(analyzer = "word", max_df=1.0, min_df=1, max_features=None)
data_features= v.fit_transform(data_features)
data_features=data_features.toarray()

In [26]:
# Split data into training and testing set.

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(data_features, labels, test_size=0.3, random_state=42)

In [27]:
from sklearn.ensemble import RandomForestClassifier

forest = RandomForestClassifier(n_estimators=200, n_jobs=4)

forest = forest.fit(X_train, y_train)

print(forest)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=4,
                       oob_score=False, random_state=None, verbose=0,
                       warm_start=False)


In [28]:
from sklearn.model_selection import cross_val_score
print(np.mean(cross_val_score(forest, data_features, labels, cv=10)))

0.7430327868852459


In [29]:
from sklearn.metrics import accuracy_score, roc_auc_score

y_predict = forest.predict(X_test)
train= forest.score(X_train, y_train)
test= forest.score(X_test, y_test)
modelaccuracy=accuracy_score(y_test, y_predict)
print ('Train data score:',train)
print ('Test score:',test)
pd.crosstab(y_test, y_predict, rownames=['Actual'], colnames=['Predicted'])

Train data score: 0.9961943793911007
Test score: 0.7830145719489982


Predicted,0,1,2
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,2594,160,60
1,377,426,81
2,179,96,419


In [30]:
# Using TfidfVectorizer to convert text data to numbers.

from sklearn.feature_extraction.text import TfidfVectorizer

data_features = data['text']
vectorizer = TfidfVectorizer(analyzer = "word", max_df=1.0, min_df=1, max_features=None)
data_features= vectorizer.fit_transform(data_features)
data_features=data_features.toarray()


In [31]:
X_train, X_test, y_train, y_test = train_test_split(data_features, labels, test_size=0.3, random_state=42)

In [32]:
from sklearn.ensemble import RandomForestClassifier

forest = RandomForestClassifier(n_estimators=200, n_jobs=4)

forest = forest.fit(X_train, y_train)

print(forest)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=4,
                       oob_score=False, random_state=None, verbose=0,
                       warm_start=False)


In [33]:
print(np.mean(cross_val_score(forest, data_features, labels, cv=10)))

0.7335382513661203


In [34]:
y_predict = forest.predict(X_test)
train= forest.score(X_train, y_train)
test= forest.score(X_test, y_test)
modelaccuracy=accuracy_score(y_test, y_predict)
print ('Train data score:',train)
print ('Test score:',test)
pd.crosstab(y_test, y_predict, rownames=['Actual'], colnames=['Predicted'])

Train data score: 0.9961943793911007
Test score: 0.7750455373406193


Predicted,0,1,2
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,2692,94,28
1,489,349,46
2,242,89,363




*   Initial data was raw data from twitter feeds and required multiple preprocessing techniques inculding:
    *   HTML Tag removal: which removes the non meaningfull HTML programming codes.
    *   Contraction removal: which changes appreviated word to its original form
    *   Removing numbers: as they are not needed for this NLP model
    *   Tokenize words: seperates words for inividual preprocessing
    *   Changing all letters to small letters
    *   Remove non-ASCII characters: to convert words with non understandable letters to basic english letters
    *   Removing punctioations: removes dots and commas as not needed here
    *   Removing stop words: not needed words for this NLP model
    *   lemmetizing the words: changes words to its basic form accodring to english dictionary

*   We used here 2 different techniques to analyse the words present in the corpus. Count vetorizer which represent words by its frequency in each document. and Tf-IDF which represnet each word with its frequency in each document and matched with its presance in the other documents whcih gives each word more or less importance depending on its presance in each document

*   Random forest classification was used to identify which document in the test set belongs to which sentimental category

*   There was no siginificant difference in accuracy using both vectorization techniques though it was expected that Tf-IDF show more accuracy compared to count vectroizer. These results are influenecd by the skewness in the data and by the model used.





