# Importing the Dependencies

In [48]:
import pandas as pd
import numpy as np
import re
#plotting
import seaborn as sns
from wordcloud import WordCloud
import matplotlib.pyplot as plt
#nltk
from nltk.stem import WordNetLemmatizer
#sklearn
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix,classification_report

# Read and Load the Dataset

In [49]:
DATASET_COLUMNS=['target','ids','date','flag','user','text']
DATASET_ENCODING = "ISO-8859-1"
df=pd.read_csv('Newtweet.csv',encoding=DATASET_ENCODING, names=DATASET_COLUMNS)

In [50]:
df.head()

Unnamed: 0,target,ids,date,flag,user,text
0,target,ids,date,flag,user,text
1,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
2,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
3,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
4,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire


# Exploratory Data Analysis

In [51]:
#columns/features in data
df.columns

Index(['target', 'ids', 'date', 'flag', 'user', 'text'], dtype='object')

In [52]:
#length of the dataset
len(df)

2159

In [53]:
#shape of the dataset
df.shape

(2159, 6)

In [54]:
#information about dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2159 entries, 0 to 2158
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   target  2159 non-null   object
 1   ids     2159 non-null   object
 2   date    2159 non-null   object
 3   flag    2159 non-null   object
 4   user    2159 non-null   object
 5   text    2159 non-null   object
dtypes: object(6)
memory usage: 101.3+ KB


In [55]:
#data types of all columns
df.dtypes

target    object
ids       object
date      object
flag      object
user      object
text      object
dtype: object

In [56]:
#checking null values 
df.isnull().sum()

target    0
ids       0
date      0
flag      0
user      0
text      0
dtype: int64

In [57]:
#Checking how many rows and columns in the dataset
print(len(df.columns))
print(len(df))

6
2159


In [58]:
#checking unique values in the target columns
df['target'].unique()

array(['target', '0'], dtype=object)

In [59]:
df['target'].nunique()

2

In [60]:
x=df['text'].values

In [61]:
x

array(['text',
       "@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D",
       "is upset that he can't update his Facebook by texting it... and might cry as a result  School today also. Blah!",
       ..., 'has got an upset stomach ',
       "Just found out an outbuilding at one of my other houses has been broken into. Again. That's probably the 6th or 7th time now ",
       "Ugh it's APRIL not DECEMBER there should NOT be snow on the ground &amp; -6 outside "],
      dtype=object)

# Stemming

In [62]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [63]:
from nltk.stem.porter import PorterStemmer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
port_stem = PorterStemmer()

In [64]:
def stemming(content):
    stem_content = re.sub('[^a-zA-Z]',' ', str(content))
    stem_content = stem_content.lower()
    stem_content = stem_content.split()
    stem_content = [port_stem.stem(word) for word in stem_content if not word in stopwords.words('english')]
    stem_content = ' '.join(stem_content)
    return stem_content

In [65]:
df['text'] = df['text'].apply(stemming)

In [66]:
x = df['text']

In [67]:
x

0                                                    text
1       switchfoot http twitpic com zl awww bummer sho...
2       upset updat facebook text might cri result sch...
3       kenichan dive mani time ball manag save rest g...
4                         whole bodi feel itchi like fire
                              ...                        
2154    silkywoven sadli true footbal definit better c...
2155    rocmoney ha plan quot dramat walk speech quot ...
2156                                    got upset stomach
2157    found outbuild one hous broken probabl th th time
2158              ugh april decemb snow ground amp outsid
Name: text, Length: 2159, dtype: object

# Sentiment Analysis

In [68]:
sentiment = SentimentIntensityAnalyzer()

In [69]:
def determine(text):
    a = sentiment.polarity_scores(text)
    key_max = max(a, key = a.get)
    return key_max
     

In [70]:
df['Result'] = df['text'].apply(determine)

In [71]:
df['Result']

0            neu
1            neu
2            neu
3            neu
4            neu
          ...   
2154    compound
2155         neu
2156         neg
2157         neu
2158         neu
Name: Result, Length: 2159, dtype: object

In [72]:
df['Result'].value_counts()

neu         1697
neg          224
compound     166
pos           72
Name: Result, dtype: int64

In [73]:
# Hence we Analysing the Twitter sentiment 
# Here
# neu -> netural text
# neg -> negative text
# pos -> positive text
# compound -> either positve and negative text 