<a href="https://colab.research.google.com/github/Kamal-018/Sentiment_Analysis/blob/main/sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
! pip install kaggle



In [1]:
# configuring the path of json file
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [2]:
# API to fecth dataset from the kaggle
! kaggle datasets download -d kazanova/sentiment140

Dataset URL: https://www.kaggle.com/datasets/kazanova/sentiment140
License(s): other
Downloading sentiment140.zip to /content
 83% 67.0M/80.9M [00:00<00:00, 123MB/s]
100% 80.9M/80.9M [00:00<00:00, 137MB/s]


In [3]:
# extracting from the compressed zip file
from zipfile import ZipFile
dataset = '/content/sentiment140.zip'
with ZipFile(dataset,'r') as zip:
  zip.extractall()
  print('dataset extracted')

dataset extracted


In [4]:
# importing the required libraries
import numpy as np
import pandas as pd
import re

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [5]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [6]:
#reading the csv file
tweets = pd.read_csv('/content/training.1600000.processed.noemoticon.csv', encoding='ISO-8859-1')

In [None]:
tweets.shape

(1599999, 6)

In [None]:
tweets.head(1)

Unnamed: 0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...


In [9]:
column_names = ['target', 'id', 'date', 'flag', 'user', 'text']
tweets = pd.read_csv('/content/training.1600000.processed.noemoticon.csv', names =column_names, encoding='ISO-8859-1')

In [10]:
tweets.head(1)

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."


In [11]:
#dropping the null datasets
tweets.isnull().sum()

Unnamed: 0,0
target,0
id,0
date,0
flag,0
user,0
text,0


In [None]:
tweets['target'].value_counts()  #balance dataset

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
0,800000
4,800000


In [12]:
tweets.replace({'target':{4:1}}, inplace=True)

In [13]:
tweets['target'].value_counts()

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
0,800000
1,800000


In [17]:
#stemming process to find out the root words
english_stopwords = set(stopwords.words('english'))

def stem_text(text):

    # remove non-alphabetic characters and convert to lowercase
    stemmed_text = re.sub('[^A-Za-z]', ' ', text)
    stemmed_text = stemmed_text.lower()

    # tokenize the tweet
    stemmed_text = stemmed_text.split()

    # stem each token without stemming the stopwords
    stemmed_tokens = []
    for token in stemmed_text:
        if token not in english_stopwords:
            stemmed_token = PorterStemmer().stem(token)
            stemmed_tokens.append(stemmed_token)

    stemmed_text = stemmed_tokens

    # Join the stemmed words back into a single string
    stemmed_text = ' '.join(stemmed_text)

    return stemmed_text


In [45]:
test_tweets = tweets[['target', 'text']]

In [46]:
test_tweets.head()

Unnamed: 0,target,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


In [47]:
tweets_0 = test_tweets[test_tweets['target'] == 0]
tweets_1 = test_tweets[test_tweets['target'] == 1]

In [48]:
n_samples = 100000
tweets_sample_0 = tweets_0.sample(n=n_samples, random_state=42)
tweets_sample_1 = tweets_1.sample(n=n_samples, random_state=42)


In [49]:
tweets_balanced = pd.concat([tweets_sample_0, tweets_sample_1]).sample(frac=1, random_state=42)


In [50]:
tweets_balanced.head()

Unnamed: 0,target,text
939310,1,Looks like the sun finally located Trondheim ;...
197472,0,A long weekend begins. The sun is shining and ...
1010754,1,to the beach we go! hope it stays nice...
103714,0,@JBFutureboy I missed it busted need to do a ...
230564,0,Why I can't change my background image??


In [51]:
tweets_balanced['stemmed_text'] = tweets_balanced['text'].apply(stem_text)

In [52]:
X = tweets_balanced['text'].values
Y = tweets_balanced['target'].values
X_train, X_test, Y_train, Y_test = train_test_split(
                                                    X, Y,
                                                    test_size = 0.2,    # 80% training, 20% testing
                                                    stratify = Y,       # Maintaining the distribution of the target column
                                                    random_state = 2    # For reproducibility
                                                    )

In [53]:
vectorizer = TfidfVectorizer()

X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [54]:
model = LogisticRegression(max_iter=500)
model.fit(X_train, Y_train)

In [55]:
X_train_predictions = model.predict(X_train)
training_accuracy = accuracy_score(Y_train, X_train_predictions)

In [56]:
print(training_accuracy*100)

83.251875


In [57]:
X_test_predictions = model.predict(X_test)
testing_accuracy = accuracy_score(Y_test, X_test_predictions)

In [58]:
print(testing_accuracy*100)

79.2425
