In [2]:
import re

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.svm import SVC

In [3]:
# Fetch the NLTK stop-word corpus used by the preprocessing step; when the
# corpus is already on disk the call just reports that it is up to date.
# (`nltk` itself is imported in the import cell at the top of the notebook.)
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mishj\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
print(stopwords.words('english'))  # stop words: very common words that carry little meaning on their own

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

In [5]:
# Data preprocessing: load the raw tweets. The CSV is read with header=None,
# so the column names are supplied explicitly.
column_names = [
    'target',
    'id',
    'date',
    'flag',
    'user',
    'text',
]
twitter_data = pd.read_csv(
    'twitter.csv',
    names=column_names,
    header=None,
    encoding='ISO-8859-1',
)

In [6]:
# Preview the first rows of the raw data to confirm the columns parsed as expected.
twitter_data.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [7]:
# (number of rows, number of columns) of the loaded frame.
twitter_data.shape

(1600000, 6)

In [8]:
# Count the missing values in each column.
twitter_data.isna().sum()

target    0
id        0
date      0
flag      0
user      0
text      0
dtype: int64

In [9]:
# Check the distribution of target values to see how balanced the classes are.
twitter_data['target'].value_counts()

target
0    800000
4    800000
Name: count, dtype: int64

In [10]:
# Recode the positive label from 4 to 1 so the target becomes a 0/1 column,
# then show the class counts again to confirm the remapping.
twitter_data['target'] = twitter_data['target'].replace({4: 1})
target_counts = twitter_data['target'].value_counts()
print(target_counts)

target
0    800000
1    800000
Name: count, dtype: int64


In [11]:
# target label 0 = negative
# target label 1 = positive (stored as 4 in the raw data, remapped above)

In [12]:
# Stemming reduces each word to its root form (e.g. "update" -> "updat").
# One PorterStemmer instance is created here and reused by stemming().
port_stem=PorterStemmer()

In [13]:
# Built once at definition time. stopwords.words('english') returns a list, so
# calling it for every token meant rebuilding that list and scanning it
# linearly per word; a frozenset gives constant-time membership tests.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
# Matches every character that is not an ASCII letter.
_NON_LETTER = re.compile('[^a-zA-Z]')


def stemming(content):
    """Normalise one piece of text into a space-separated string of stems.

    Steps, in order:
      1. replace every non-letter character with a space,
      2. lower-case the text and split it on whitespace,
      3. drop English stop words,
      4. reduce each remaining word to its Porter stem.

    Parameters
    ----------
    content : str
        Raw text (for example a single tweet).

    Returns
    -------
    str
        The surviving stems joined by single spaces; an empty string when
        nothing survives the filtering.
    """
    words = _NON_LETTER.sub(' ', content).lower().split()
    return ' '.join(
        port_stem.stem(word) for word in words if word not in _ENGLISH_STOPWORDS
    )

In [14]:
# twitter_data['stemmed_content']=twitter_data['text'].apply(stemming)

In [15]:
# print(twitter_data['stemmed_content'])

In [16]:
# Stemming all 1.6M tweets is slow, so the result is cached on disk: load the
# cache when it exists, otherwise compute the stemmed text once and save it.
# This keeps the notebook runnable from a fresh checkout (Restart & Run All).
# NOTE: unpickling can execute arbitrary code -- only load a cache file that
# this notebook produced itself.
STEMMED_CACHE_PATH = "stemmed_content.pkl"

try:
    twitter_data_stemmed = pd.read_pickle(STEMMED_CACHE_PATH)
except FileNotFoundError:
    # Store just the stemmed Series (named 'stemmed_content'), not the whole frame.
    twitter_data_stemmed = twitter_data['text'].apply(stemming).rename('stemmed_content')
    twitter_data_stemmed.to_pickle(STEMMED_CACHE_PATH)

# Features come from this Series and labels from twitter_data, paired by
# position, so a cache built from a different CSV would silently misalign them.
if len(twitter_data_stemmed) != len(twitter_data):
    raise ValueError(
        f"{STEMMED_CACHE_PATH} has {len(twitter_data_stemmed)} rows but twitter_data has "
        f"{len(twitter_data)}; delete the cache file and re-run this cell."
    )


In [17]:
# Preview the stemmed tweets (pandas abbreviates the display of a long Series).
twitter_data_stemmed

0          switchfoot http twitpic com zl awww bummer sho...
1          upset updat facebook text might cri result sch...
2          kenichan dive mani time ball manag save rest g...
3                            whole bodi feel itchi like fire
4                              nationwideclass behav mad see
                                 ...                        
1599995                           woke school best feel ever
1599996    thewdb com cool hear old walt interview http b...
1599997                         readi mojo makeov ask detail
1599998    happi th birthday boo alll time tupac amaru sh...
1599999    happi charitytuesday thenspcc sparkschar speak...
Name: stemmed_content, Length: 1600000, dtype: object

In [18]:
# Show the label column that will be used as the prediction target.
print(twitter_data['target'])

0          0
1          0
2          0
3          0
4          0
          ..
1599995    1
1599996    1
1599997    1
1599998    1
1599999    1
Name: target, Length: 1600000, dtype: int64


In [28]:
# Model input: the stemmed tweet texts as a NumPy array.
X = twitter_data_stemmed.to_numpy()

In [29]:
# Model target: the label column as a NumPy array.
y = twitter_data['target'].to_numpy()

In [30]:
# Hold out 20% of the rows for testing. The split is stratified on the labels
# so both parts keep the same class balance, and seeded so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2,
    stratify=y,
)

In [31]:
# Convert the textual data to numerical TF-IDF features.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 3),   # unigrams, bigrams and trigrams
    max_features=100000,  # cap on the vocabulary size
    sublinear_tf=True,    # sublinear term-frequency scaling; helps large datasets
)

# The vocabulary is fitted on the training texts only; the test texts are
# transformed with that same fitted vectorizer.
# Note: X_train / X_test are rebound from raw text to feature matrices here,
# so re-run the split cell before re-running this one.
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [32]:
# Inspect the vectorised training data (a sparse matrix; the printout is abbreviated).
print(X_train)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 12249876 stored elements and shape (1280000, 100000)>
  Coords	Values
  (0, 92776)	0.226730546053314
  (0, 71690)	0.29843494609144333
  (0, 42020)	0.438964228477445
  (0, 19967)	0.31220992695572786
  (0, 47581)	0.349303261118467
  (0, 95621)	0.37301380683658636
  (0, 93234)	0.5551694711946487
  (2, 19967)	0.28408039996898515
  (2, 21980)	0.13829596346277317
  (2, 84157)	0.13674485997939545
  (2, 23899)	0.21250063834654384
  (2, 83524)	0.19865348118804999
  (2, 91274)	0.24094520499895222
  (2, 13246)	0.22865250952061522
  (2, 95653)	0.2447461135564103
  (2, 53836)	0.17631450558588618
  (2, 84843)	0.1108709091128146
  (2, 40668)	0.11833198086211333
  (2, 31594)	0.13743563887372914
  (2, 24942)	0.14829569231078027
  (2, 58041)	0.12269429245431486
  (2, 22169)	0.20408875639069057
  (2, 20032)	0.3185693847047648
  (2, 85308)	0.25762467608740774
  (2, 83760)	0.21758909179592545
  :	:
  (1279997, 55004)	0.3375482644123121
  (127999

In [33]:
# Candidate classifiers, keyed by the label used in the printed report.
models = dict([
    ("Logistic Regression", LogisticRegression(C=1, max_iter=1000)),
    ("Naïve Bayes (MultinomialNB)", MultinomialNB(alpha=0.1)),
])

# Fit every candidate on the training split, predict the held-out split, and
# print its accuracy followed by the per-class precision/recall report.
for name in models:
    model = models[name]
    y_pred = model.fit(X_train, y_train).predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"\n{name} Accuracy: {accuracy:.4f}")
    report = classification_report(
        y_test,
        y_pred,
        target_names=['negative', 'positive'],
    )
    print(report)


Logistic Regression Accuracy: 0.7889
              precision    recall  f1-score   support

    negative       0.80      0.77      0.78    160000
    positive       0.78      0.81      0.79    160000

    accuracy                           0.79    320000
   macro avg       0.79      0.79      0.79    320000
weighted avg       0.79      0.79      0.79    320000


Naïve Bayes (MultinomialNB) Accuracy: 0.7696
              precision    recall  f1-score   support

    negative       0.77      0.77      0.77    160000
    positive       0.77      0.77      0.77    160000

    accuracy                           0.77    320000
   macro avg       0.77      0.77      0.77    320000
weighted avg       0.77      0.77      0.77    320000



In [25]:
# Hyper-parameter search for logistic regression: every combination in the
# grid is scored by accuracy with 3-fold cross-validation on the training data.
# (GridSearchCV is imported in the import cell at the top of the notebook.)
log_reg_params = {
    "C": [0.01, 0.1, 1, 10],  # inverse of regularization strength: smaller = stronger penalty
    "solver": ["liblinear", "lbfgs"],
    "max_iter": [100, 300, 500]
}
# n_jobs=-1 runs the candidate fits in parallel on all available processors.
log_reg = GridSearchCV(LogisticRegression(), log_reg_params, cv=3, scoring="accuracy", n_jobs=-1)
log_reg.fit(X_train, y_train)
print(f"Best Logistic Regression Params: {log_reg.best_params_}")
# Mean cross-validated accuracy of the best parameter combination.
print(log_reg.best_score_)

Best Logistic Regression Params: {'C': 1, 'max_iter': 100, 'solver': 'liblinear'}
0.7764789060391557


In [None]:
# Linear support-vector classifier.
# SVC(kernel='linear') is backed by libsvm, whose fit time grows at least
# quadratically with the number of samples, which is impractical for the
# 1.28M-row training matrix used here. LinearSVC (liblinear) fits a linear SVM
# and scales to this size; C and class_weight are kept as before.
# NOTE(review): LinearSVC is not numerically identical to SVC(kernel='linear')
# (its default loss is the squared hinge) -- confirm this is acceptable, and
# watch for a convergence warning with C=50.
svm_model = LinearSVC(C=50, class_weight='balanced')
svm_model.fit(X_train, y_train)
y_pred_svm = svm_model.predict(X_test)