In [1]:
#Data Analysis
import nltk
import numpy as np

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.utils import shuffle
#Data Preprocessing and Feature Engineering

import re
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

#Model Selection and Validation
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, classification_report,accuracy_score
from textblob import TextBlob
from sklearn.svm import LinearSVC

In [2]:
train_tweets = shuffle(pd.read_csv("./input/dataset1.csv"))
test_tweets = shuffle(pd.read_csv("./input/dataset2.csv"))

In [3]:
print("Any missing sample in training set:",train_tweets.isnull().values.any())
print("Any missing sample in test set:",test_tweets.isnull().values.any(), "\n")

Any missing sample in training set: False
Any missing sample in test set: False 



In [4]:
def form_sentence(tweet):
    tweet_blob = TextBlob(tweet)
    return ' '.join(tweet_blob.words)

#print(form_sentence(train_tweets['Answers'].iloc[10]))
#print(train_tweets['Answers'].iloc[10])

In [5]:

def no_user_alpha(tweet):
    tweet_list = [ele for ele in tweet.split() if ele != 'user']
    clean_tokens = [t for t in tweet_list if re.match(r'[^\W\d]*$', t)]
    clean_s = ' '.join(clean_tokens)
    clean_mess = [word for word in clean_s.split() if word.lower() not in stopwords.words('english')]
    return clean_mess
#print(no_user_alpha(form_sentence(train_tweets['Answers'].iloc[10])))
#print(train_tweets['Answers'].iloc[10])

In [6]:
def normalization(tweet_list):
        lem = WordNetLemmatizer()
        normalized_tweet = []
        for word in tweet_list:
            normalized_text = lem.lemmatize(word,'v')
            normalized_tweet.append(normalized_text)
        return normalized_tweet
    
tweet_list = 'I was playing with my friends with whom I used to play, when you called me yesterday'.split()
print(normalization(tweet_list))

['I', 'be', 'play', 'with', 'my', 'friends', 'with', 'whom', 'I', 'use', 'to', 'play,', 'when', 'you', 'call', 'me', 'yesterday']


In [7]:

pipeline = Pipeline([
    ('bow',CountVectorizer(analyzer ='word')),  # strings to token integer counts
    ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
    ('classifier', LinearSVC()),  # train on TF-IDF vectors w/ Naive Bayes classifier
])

In [8]:
msg_train, msg_test, label_train, label_test = train_test_split(train_tweets['Answers'], train_tweets['Labels'], test_size=0.2)
pipeline.fit(msg_train,label_train)
predictions = pipeline.predict(msg_test)
print(classification_report(predictions,label_test))
print(confusion_matrix(predictions,label_test))
print(accuracy_score(predictions,label_test))

              precision    recall  f1-score   support

    negative       0.92      0.84      0.88      1875
     neutral       0.59      0.69      0.63       532
    positive       0.69      0.80      0.74       398

    accuracy                           0.81      2805
   macro avg       0.73      0.78      0.75      2805
weighted avg       0.82      0.81      0.81      2805

[[1582  209   84]
 [ 106  365   61]
 [  33   47  318]]
0.8074866310160428


In [9]:
test_tweets = shuffle(pd.read_csv("./input/dataset2.csv"))
X=np.asarray(test_tweets)
num_rows, num_cols = X.shape

i = 0
neg=0
pos=0
nue=0
while i < num_rows:
    if(X[i][1]=="negative"):
        neg+=1
    elif(X[i][1]=="positive"):
        pos+=1
    elif(X[i][1]=="neutral"):
        nue+=1
    i += 1

pos1=pos*10
nue1=nue*5

total=(pos1+nue1)/num_rows
print("Rating =",total,"/ 5")



Rating = 2.394911504424779 / 5
