<a href="https://colab.research.google.com/github/JoaquinGonzalezSimon/Data_science_and_ML_from_Medium/blob/main/230327_Twitter_Sentiment_Analysis_using_Python.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### This project was taken from the following link

https://thecleverprogrammer.com/2021/09/13/twitter-sentiment-analysis-using-python/

In [1]:
import pandas as pd
import numpy as np

import re
import string
import nltk

from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.tree import DecisionTreeClassifier

In [2]:
nltk.download('vader_lexicon')
nltk.download('stopwords')
stemmer = nltk.SnowballStemmer('english')
stopword=set(stopwords.words('english'))

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [3]:
def clean(text):
  text = str(text).lower()
  text = re.sub(r'\[.*?\]', '', text)
  text = re.sub(r'https?://\S+|www\.\S+', '', text)
  text = re.sub(r'<.*?>+', '', text)
  text = re.sub(r'\n', '', text)
  text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)
  text = re.sub(r'\w*\d\w*', '', text)
  text = [word for word in text.split(' ') if word not in stopword]
  text = ' '.join(text)
  text = [stemmer.stem(word) for word in text.split(' ')]
  text = ' '.join(text)
  return text

In [4]:
data = pd.read_csv("https://raw.githubusercontent.com/amankharwal/Website-data/master/twitter.csv")
data.head()

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24783 entries, 0 to 24782
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Unnamed: 0          24783 non-null  int64 
 1   count               24783 non-null  int64 
 2   hate_speech         24783 non-null  int64 
 3   offensive_language  24783 non-null  int64 
 4   neither             24783 non-null  int64 
 5   class               24783 non-null  int64 
 6   tweet               24783 non-null  object
dtypes: int64(6), object(1)
memory usage: 1.3+ MB


In [6]:
data.isnull().sum()

Unnamed: 0            0
count                 0
hate_speech           0
offensive_language    0
neither               0
class                 0
tweet                 0
dtype: int64

In [7]:
data['clean_text'] = data.tweet.apply(clean)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24783 entries, 0 to 24782
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Unnamed: 0          24783 non-null  int64 
 1   count               24783 non-null  int64 
 2   hate_speech         24783 non-null  int64 
 3   offensive_language  24783 non-null  int64 
 4   neither             24783 non-null  int64 
 5   class               24783 non-null  int64 
 6   tweet               24783 non-null  object
 7   clean_text          24783 non-null  object
dtypes: int64(6), object(2)
memory usage: 1.5+ MB


In [8]:
data = data.drop('Unnamed: 0', axis=1)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24783 entries, 0 to 24782
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   count               24783 non-null  int64 
 1   hate_speech         24783 non-null  int64 
 2   offensive_language  24783 non-null  int64 
 3   neither             24783 non-null  int64 
 4   class               24783 non-null  int64 
 5   tweet               24783 non-null  object
 6   clean_text          24783 non-null  object
dtypes: int64(5), object(2)
memory usage: 1.3+ MB


In [9]:
sentiments = SentimentIntensityAnalyzer()
data['pos'] = [sentiments.polarity_scores(i)['pos'] for i in data.clean_text]
data['neg'] = [sentiments.polarity_scores(i)['neg'] for i in data.clean_text]
data['neu'] = [sentiments.polarity_scores(i)['neu'] for i in data.clean_text]
data['compound'] = [sentiments.polarity_scores(i)['compound'] for i in data.clean_text]

In [10]:
df = data[['clean_text', 'pos', 'neg', 'neu', 'compound']]
df.head()

Unnamed: 0,clean_text,pos,neg,neu,compound
0,rt mayasolov woman shouldnt complain clean ho...,0.147,0.157,0.696,-0.0382
1,rt boy dat coldtyga dwn bad cuffin dat hoe ...,0.0,0.28,0.72,-0.5423
2,rt urkindofbrand dawg rt ever fuck bitch sta...,0.0,0.577,0.423,-0.8979
3,rt cganderson vivabas look like tranni,0.333,0.0,0.667,0.3612
4,rt shenikarobert shit hear might true might f...,0.154,0.407,0.44,-0.6808


In [12]:
x = sum(df.pos)
y = sum(df.neg)
z = sum(df.neu)
j = sum(df.compound)

In [13]:
print('Positive: ', x)
print('Negative: ', y)
print('Neutral: ', z)
print('Compound: ', j)

Positive:  2880.086000000009
Negative:  7201.020999999922
Neutral:  14696.887999999733
Compound:  -6452.106600000134


In [14]:
def sentiment_score(a,b,c):
  if (a>b) and (a>c):
    print('Positive')
  elif (b>a) and (b>c):
    print('Negative')
  else:
    print('Neutral')

In [15]:
sentiment_score(x,y,z)

Neutral
