In [1]:
pip install kaggle



In [2]:
# Copy the Kaggle API credentials (kaggle.json in the working directory) into ~/.kaggle.
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
# Restrict the credentials file to owner read/write only.
!chmod 600 ~/.kaggle/kaggle.json

In [3]:
# Download the Sentiment140 dataset archive with the Kaggle CLI.
!kaggle datasets download -d kazanova/sentiment140

Dataset URL: https://www.kaggle.com/datasets/kazanova/sentiment140
License(s): other
Downloading sentiment140.zip to /content
  0% 0.00/80.9M [00:00<?, ?B/s]
100% 80.9M/80.9M [00:00<00:00, 866MB/s]


In [4]:
#extracting the compressed dataset
from zipfile import ZipFile
dataset = '/content/sentiment140.zip'

# The handle is called `archive` rather than `zip`: a module-level variable named
# `zip` would shadow the built-in zip() for every later cell in the kernel.
with ZipFile(dataset,'r') as archive:
  archive.extractall()
  print("The dataset is extracted")

The dataset is extracted


In [5]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [6]:
import nltk
# Fetch the NLTK stop-word corpus that the preprocessing cells read from.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [7]:
# Print the English stop words; they carry little sentiment and are removed during preprocessing.
print(stopwords.words('english'))

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

Data Preprocessing


In [8]:
# First attempt at loading: no `names=` is passed, so pandas treats the first record as the header row.
twitter_data= pd.read_csv('/content/training.1600000.processed.noemoticon.csv',encoding='ISO-8859-1')

In [9]:
# Row/column count; one row short of 1,600,000 because the first record was used as the header.
twitter_data.shape

(1599999, 6)

In [10]:
# Preview the frame; the column labels shown here are really the first tweet's values.
twitter_data.head()

Unnamed: 0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


In [11]:
# The CSV has no header row, so re-read it with explicit column names.
column_names=['target','id','date','flag','user','text']
twitter_data= pd.read_csv('/content/training.1600000.processed.noemoticon.csv',names=column_names,encoding='ISO-8859-1')
# All rows are now data rows.
twitter_data.shape

(1600000, 6)

In [12]:
# Preview the first rows now that the columns are named.
twitter_data.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [13]:
# Count missing values in each column.
twitter_data.isnull().sum()

Unnamed: 0,0
target,0
id,0
date,0
flag,0
user,0
text,0


In [14]:
# Distribution of the raw sentiment labels.
twitter_data['target'].value_counts()

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
0,800000
4,800000


Convert the target label 4 to 1 so that positive tweets are encoded as 1.


In [15]:
# Relabel positive tweets from 4 to 1; only the 'target' column is rewritten.
twitter_data['target']=twitter_data['target'].replace({4: 1})

In [16]:
# Confirm the labels are now 0 and 1.
twitter_data['target'].value_counts()

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
0,800000
1,800000


0 --> Negative tweet

1 --> Positive tweet

**Stemming**

Stemming is the process of reducing a word to its root form.


Example (Porter stemmer): acting, acted, acts → act

In [17]:
port_stem=PorterStemmer()
# Build the stop-word set once. Calling stopwords.words('english') inside the loop
# re-reads the corpus and does a linear list search for every single word, which is
# what makes preprocessing 1.6M tweets take so long.
_STOP_WORDS=frozenset(stopwords.words('english'))
# Anything that is not an ASCII letter is replaced by a space.
_NON_LETTERS=re.compile('[^a-zA-Z]')

def stemming(content):
  """Normalise one tweet for vectorisation.

  Non-letter characters are replaced with spaces, the text is lower-cased and
  split on whitespace, English stop words are dropped, and each remaining word
  is reduced to its Porter stem.

  content: the raw tweet text (str).
  Returns the stemmed words joined by single spaces (empty string if none remain).
  """
  words=_NON_LETTERS.sub(' ',content).lower().split()
  return ' '.join(port_stem.stem(word) for word in words if word not in _STOP_WORDS)


In [18]:
# Stem every tweet; slow on the full 1.6M-row dataset (originally noted as taking about an hour).
twitter_data['stemmed_content']=twitter_data['text'].apply(stemming)

In [19]:
# Compare the raw text with its stemmed version.
twitter_data.head()

Unnamed: 0,target,id,date,flag,user,text,stemmed_content
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",switchfoot http twitpic com zl awww bummer sho...
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,upset updat facebook text might cri result sch...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,kenichan dive mani time ball manag save rest g...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,whole bodi feel itchi like fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....",nationwideclass behav mad see


In [20]:
# Show the stemmed text column.
print(twitter_data['stemmed_content'])

0          switchfoot http twitpic com zl awww bummer sho...
1          upset updat facebook text might cri result sch...
2          kenichan dive mani time ball manag save rest g...
3                            whole bodi feel itchi like fire
4                              nationwideclass behav mad see
                                 ...                        
1599995                           woke school best feel ever
1599996    thewdb com cool hear old walt interview http b...
1599997                         readi mojo makeov ask detail
1599998    happi th birthday boo alll time tupac amaru sh...
1599999    happi charitytuesday thenspcc sparkschar speak...
Name: stemmed_content, Length: 1600000, dtype: object


In [21]:
# Show the label column.
print(twitter_data['target'])

0          0
1          0
2          0
3          0
4          0
          ..
1599995    1
1599996    1
1599997    1
1599998    1
1599999    1
Name: target, Length: 1600000, dtype: int64


In [22]:
#seperating the data and label
X=twitter_data['stemmed_content'].values
Y=twitter_data['target'].values

In [23]:
# Inspect the feature array.
print(X)

['switchfoot http twitpic com zl awww bummer shoulda got david carr third day'
 'upset updat facebook text might cri result school today also blah'
 'kenichan dive mani time ball manag save rest go bound' ...
 'readi mojo makeov ask detail'
 'happi th birthday boo alll time tupac amaru shakur'
 'happi charitytuesday thenspcc sparkschar speakinguph h']


In [24]:
# Inspect the label array.
print(Y)

[0 0 0 ... 1 1 1]


Splitting the data into training data and test data


In [23]:
# Hold out 20% of the tweets for testing, stratified on the label, with a fixed seed.
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.2,stratify=Y,random_state=2)

In [24]:
#converting the textual data to numerical data
vectorizer=TfidfVectorizer()

X_train=vectorizer.fit_transform(X_train)
X_test=vectorizer.transform(X_test)

Training the machine learning model

In [25]:
# Fit a logistic-regression classifier on the TF-IDF features, with an iteration cap of 1000.
model=LogisticRegression(max_iter=1000)
model.fit(X_train,Y_train)

Model Evaluation


Accuracy Score

In [26]:
#accuracy score on the training data
X_train_prediction=model.predict(X_train)
training_data_accuracy=accuracy_score(X_train_prediction,Y_train)
print('Accuracy score of the training data : ',training_data_accuracy*100,"%")

Accuracy score of the training data :  82.87195312499999 %


In [27]:
#accuracy score on the test data
X_test_prediction=model.predict(X_test)
test_data_accuracy=accuracy_score(X_test_prediction,Y_test)
print('Accuracy score of the testing data : ',test_data_accuracy*100,"%")
from sklearn.metrics import classification_report, confusion_matrix

Accuracy score of the testing data :  80.668125 %


In [32]:
# Per-class precision/recall/F1 and the confusion matrix for the logistic-regression
# predictions on the test split.
print("Classification report:\n")
print(classification_report(Y_test, X_test_prediction))
print("Confusion Matrix:")
print(confusion_matrix(Y_test, X_test_prediction))

Classification report:

              precision    recall  f1-score   support

           0       0.79      0.76      0.77    160000
           1       0.77      0.80      0.78    160000

    accuracy                           0.78    320000
   macro avg       0.78      0.78      0.78    320000
weighted avg       0.78      0.78      0.78    320000

Confusion Matrix:
[[121246  38754]
 [ 32708 127292]]


Using XGBoost

In [33]:
from xgboost import XGBClassifier

# Hyper-parameters for the gradient-boosted tree classifier, kept in one place.
xgb_params = dict(
    objective='binary:logistic',
    eval_metric='logloss',
    n_estimators=500,
    max_depth=5,
    learning_rate=0.1,
    random_state=2,
)

# Train on the same TF-IDF features and labels as the other models.
model2 = XGBClassifier(**xgb_params)
model2.fit(X_train, Y_train)


In [34]:
# Evaluate the XGBoost model on the held-out test split.
Y_pred = model2.predict(X_test)
print("Accuracy:", accuracy_score(Y_test, Y_pred))
print(classification_report(Y_test, Y_pred))

Accuracy: 0.745653125
              precision    recall  f1-score   support

           0       0.79      0.67      0.72    160000
           1       0.71      0.82      0.76    160000

    accuracy                           0.75    320000
   macro avg       0.75      0.75      0.74    320000
weighted avg       0.75      0.75      0.74    320000



Using SVM

In [35]:
from sklearn.svm import LinearSVC

# Train SVM
# random_state is fixed, like the split and the other seeded models, so repeated
# runs give the same result when the solver shuffles the data.
svm_model = LinearSVC(random_state=2)
svm_model.fit(X_train, Y_train)

# Predict and evaluate
y_pred_svm = svm_model.predict(X_test)
svm_acc = accuracy_score(Y_test, y_pred_svm)
print("SVM Accuracy:", svm_acc)

# Classification report and confusion matrix
print("\nClassification report:\n")
print(classification_report(Y_test, y_pred_svm))
print("Confusion Matrix:")
cm_svm = confusion_matrix(Y_test, y_pred_svm)
print(cm_svm)

SVM Accuracy: 0.769671875

Classification report:

              precision    recall  f1-score   support

           0       0.78      0.76      0.77    160000
           1       0.76      0.78      0.77    160000

    accuracy                           0.77    320000
   macro avg       0.77      0.77      0.77    320000
weighted avg       0.77      0.77      0.77    320000

Confusion Matrix:
[[121305  38695]
 [ 35010 124990]]


Using Naive Bayes


In [41]:
from sklearn.naive_bayes import MultinomialNB

# Train Naive Bayes
nb_model = MultinomialNB()
nb_model.fit(X_train, Y_train)

# Predict and evaluate
y_pred_nb = nb_model.predict(X_test)
nb_acc = accuracy_score(Y_test, y_pred_nb)
print("Naive Bayes Accuracy:", nb_acc)

# Classification report
print("\nClassification report:\n")
print(classification_report(Y_test, y_pred_nb))

# Confusion matrix: computed once and kept in cm_nb, so the SVM matrix stored in
# cm_svm is no longer overwritten with Naive Bayes results.
print("Confusion Matrix:")
cm_nb = confusion_matrix(Y_test, y_pred_nb)
print(cm_nb)

Naive Bayes Accuracy: 0.755815625

Classification report:

              precision    recall  f1-score   support

           0       0.74      0.78      0.76    160000
           1       0.77      0.73      0.75    160000

    accuracy                           0.76    320000
   macro avg       0.76      0.76      0.76    320000
weighted avg       0.76      0.76      0.76    320000

Confusion Matrix:
[[125317  34683]
 [ 43456 116544]]


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Train Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=2)
rf_model.fit(X_train, Y_train)

# Predict and evaluate
y_pred_rf = rf_model.predict(X_test)
rf_acc = accuracy_score(Y_test, y_pred_rf)
print("Random Forest Accuracy:", rf_acc)

# Classification report
print("\nClassification report:\n")
print(classification_report(Y_test, y_pred_rf))

# Confusion matrix: computed once and kept in cm_rf, so the SVM matrix stored in
# cm_svm is no longer overwritten with Random Forest results.
print("Confusion Matrix:")
cm_rf = confusion_matrix(Y_test, y_pred_rf)
print(cm_rf)

Saving the trained Model

In [None]:
import pickle

# Save the trained model (`model` is the logistic-regression classifier fitted above)
with open('sentiment_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

# Save the fitted TfidfVectorizer so new text can be transformed with the same vocabulary
with open('tfidf_vectorizer.pkl', 'wb') as vec_file:
    pickle.dump(vectorizer, vec_file)

Using the saved model for future predictions

In [None]:
#loading the saved model
loaded_model=pickle.load(open('sentiment_model.pkl','rb'))

In [None]:
# Pick one held-out sample and show its true label.
sample_index=200
X_new=X_test[sample_index]
print(Y_test[sample_index])

# Predict with the reloaded model and show the raw prediction array.
prediction=loaded_model.predict(X_new)
print(prediction)

# Label 0 means negative; anything else is reported as positive.
print('The tweet is Negative' if prediction[0]==0 else 'The tweet is Positive')

1
[1]
The tweet is Positive


In [None]:
import joblib
# Re-save the model and vectorizer with joblib. The file names are the same as the
# pickle-based save cell above, so those files are overwritten.
joblib.dump(model, 'sentiment_model.pkl')
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')

['tfidf_vectorizer.pkl']

In [None]:
import re
import joblib
import tweepy
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
class TwitterClient(object):
    '''
    Twitter Client for sentiment analysis using custom ML model.

    Fetches recent tweets through the Twitter API v2 and labels each one with
    the model and vectorizer loaded from 'sentiment_model.pkl' and
    'tfidf_vectorizer.pkl'.
    '''
    # Same character filter as the training-time `stemming` step: anything that is
    # not an ASCII letter becomes a space.
    _NON_LETTERS = re.compile('[^a-zA-Z]')

    def __init__(self, bearer_token=None):
        '''
        Build the API client and load the trained artifacts.

        bearer_token: Twitter API v2 bearer token. When omitted, it is read from
            the TWITTER_BEARER_TOKEN environment variable.

        Raises ValueError when no token is available.
        '''
        import os  # local import so this cell does not depend on another cell's imports

        # SECURITY: the token must never be hardcoded in the notebook. Any token that
        # was ever committed in source should be treated as leaked and revoked.
        if bearer_token is None:
            bearer_token = os.environ.get('TWITTER_BEARER_TOKEN')
        if not bearer_token:
            raise ValueError(
                "Twitter bearer token missing: pass bearer_token or set the "
                "TWITTER_BEARER_TOKEN environment variable."
            )

        # Initialize Twitter API v2 Client
        self.client = tweepy.Client(bearer_token=bearer_token)

        # Load trained sentiment analysis model and vectorizer
        self.model = joblib.load('sentiment_model.pkl')
        self.vectorizer = joblib.load('tfidf_vectorizer.pkl')

        # Initialize stemmer and stopwords (a set gives constant-time membership tests)
        self.stemmer = PorterStemmer()
        self.stop_words = set(stopwords.words('english'))

    def clean_tweet(self, tweet):
        '''
        Clean and stem tweet text the same way the training data was prepared.

        Non-letter characters become spaces, the text is lower-cased and split on
        whitespace, stop words are dropped and the remaining words are stemmed.
        Using the same steps as training keeps the tokens aligned with the
        vocabulary the vectorizer was fitted on.

        Returns the stemmed words joined by single spaces.
        '''
        words = self._NON_LETTERS.sub(' ', tweet).lower().split()
        return ' '.join(self.stemmer.stem(word) for word in words if word not in self.stop_words)

    def get_tweet_sentiment(self, tweet):
        '''
        Predict sentiment using trained model.

        Returns 'positive' when the model predicts label 1, otherwise 'negative'.
        '''
        cleaned = self.clean_tweet(tweet)
        vectorized = self.vectorizer.transform([cleaned])
        prediction = self.model.predict(vectorized)[0]
        return 'positive' if prediction == 1 else 'negative'

    def get_tweets(self, query, count=100):
        '''
        Fetch and process tweets using Twitter API v2.

        query: search terms; English-only and no-retweet filters are appended.
        count: maximum number of tweets to return (at most 100 per request).

        Returns a list of {'text', 'sentiment'} dicts; an empty list when nothing
        is found or when the request fails (the error is printed).
        '''
        if count <= 0:
            return []
        tweets = []
        try:
            response = self.client.search_recent_tweets(
                query=f"{query} lang:en -is:retweet",
                # The recent-search endpoint only accepts max_results from 10 to 100.
                max_results=max(10, min(count, 100)),
                tweet_fields=['text']
            )
            if response.data:
                # Trim to the requested count, since at least 10 are always requested.
                for tweet in response.data[:count]:
                    parsed_tweet = {
                        'text': tweet.text,
                        'sentiment': self.get_tweet_sentiment(tweet.text)
                    }
                    tweets.append(parsed_tweet)
            return tweets
        except Exception as e:
            # Best effort: report the failure and let the caller handle an empty result.
            print("Error:", e)
            return []

def main():
    """Fetch recent tweets for a fixed query and print a sentiment summary.

    Prints the share of positive and negative tweets followed by up to ten
    examples of each, or a notice when no tweets were returned.
    """
    client = TwitterClient()
    fetched = client.get_tweets(query='Donald Trump', count=100)

    # Nothing to summarise: report it and stop.
    if not fetched:
        print("No tweets fetched or API limit issue.")
        return

    positives = [item for item in fetched if item['sentiment'] == 'positive']
    negatives = [item for item in fetched if item['sentiment'] == 'negative']
    total = len(fetched)

    print("Sentiment analysis done")
    print("Positive tweets percentage: {:.2f} %".format(100 * len(positives) / total))
    print("Negative tweets percentage: {:.2f} %".format(100 * len(negatives) / total))

    # Show at most ten sample tweets per class.
    for heading, group in (("\n\nPositive tweets:", positives),
                           ("\n\nNegative tweets:", negatives)):
        print(heading)
        for item in group[:10]:
            print(item['text'])


if __name__ == "__main__":
    main()


In [None]:
!pip install streamlit pyngrok

Collecting pyngrok
  Downloading pyngrok-7.2.9-py3-none-any.whl.metadata (9.3 kB)
Downloading pyngrok-7.2.9-py3-none-any.whl (25 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.2.9


In [None]:
%%writefile app.py
import streamlit as st

st.title("Twitter Sentiment Analysis")
st.write("This is a demo Streamlit app running in Google Colab!")

Writing app.py


In [None]:
import os
from getpass import getpass

from pyngrok import ngrok

# Read the ngrok token from the NGROK_AUTH_TOKEN environment variable, or prompt for
# it, so the secret is never pasted into (and saved with) the notebook.
ngrok.set_auth_token(os.environ.get("NGROK_AUTH_TOKEN") or getpass("ngrok auth token: "))

In [None]:
pip install matplotlib



In [None]:
from pyngrok import ngrok
import matplotlib

In [None]:
# Stop any streamlit and ngrok processes left over from an earlier run.
!pkill streamlit
!pkill ngrok

In [None]:
from pyngrok import ngrok
# Open an ngrok tunnel to local port 8501 and print its public URL.
public_url = ngrok.connect(8501)
print("Streamlit URL:", public_url)

# Start the Streamlit app in the background with its output discarded.
# The tunnel is opened before the app starts, so the URL may not respond immediately.
!streamlit run app.py &> /dev/null &

Streamlit URL: NgrokTunnel: "https://fb7c-35-231-152-3.ngrok-free.app" -> "http://localhost:8501"


In [None]:
# Bundle the app, the saved model/vectorizer and the training CSV into one archive.
# NOTE(review): no cell visible in this notebook writes twitter_client.py; it has to
# exist in the working directory already — confirm where it comes from.
!zip -r sentiment_app.zip app.py twitter_client.py sentiment_model.pkl tfidf_vectorizer.pkl training.1600000.processed.noemoticon.csv


  adding: app.py (deflated 58%)
  adding: twitter_client.py (deflated 62%)
  adding: sentiment_model.pkl (deflated 6%)
  adding: tfidf_vectorizer.pkl (deflated 56%)
  adding: training.1600000.processed.noemoticon.csv (deflated 64%)
