In [3]:
import numpy as np
import pandas as pd
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Mayur\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# The CSV is read without a header row, so the column labels are supplied here.
col_names = ['target', 'id', 'date', 'flag', 'user', 'text']
data = pd.read_csv(
    'training.1600000.processed.noemoticon.csv.zip',
    names=col_names,
    encoding='ISO-8859-1',
)
# Quick sanity check: frame dimensions followed by the first few rows.
print(data.shape, data.head(), sep='\n')

(1600000, 6)
   target          id                          date      flag  \
0       0  1467810369  Mon Apr 06 22:19:45 PDT 2009  NO_QUERY   
1       0  1467810672  Mon Apr 06 22:19:49 PDT 2009  NO_QUERY   
2       0  1467810917  Mon Apr 06 22:19:53 PDT 2009  NO_QUERY   
3       0  1467811184  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY   
4       0  1467811193  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY   

              user                                               text  
0  _TheSpecialOne_  @switchfoot http://twitpic.com/2y1zl - Awww, t...  
1    scotthamilton  is upset that he can't update his Facebook by ...  
2         mattycus  @Kenichan I dived many times for the ball. Man...  
3          ElleCTF    my whole body feels itchy and like its on fire   
4           Karoli  @nationwideclass no, it's not behaving at all....  


In [5]:

# Only use the 'text' and 'target' columns.
# The explicit .copy() makes `data` an independent frame, so the column
# assignment below does not write into a selection of the original frame
# (the pattern that can raise pandas' SettingWithCopyWarning).
data = data[['target', 'text']].copy()
# Replace target label 4 with 1 (so target is 0=negative, 1=positive)
data['target'] = data['target'].replace(4, 1)

In [6]:
# English stopwords held in a set so the membership test in preprocess() is cheap.
stop_words = {*stopwords.words('english')}
# One shared Porter stemmer instance, reused for every token.
ps = PorterStemmer()

In [7]:
def preprocess(text):
    """Clean one tweet and return it as a space-separated string of stems.

    Steps, in order:
      1. every character that is not an ASCII letter becomes a space;
      2. the text is lowercased and split on whitespace;
      3. tokens found in the module-level ``stop_words`` set are dropped;
      4. each remaining token is stemmed with the module-level ``ps`` stemmer.

    Returns an empty string when no token survives.
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', text)
    tokens = letters_only.lower().split()

    stems = []
    for token in tokens:
        if token in stop_words:
            continue
        stems.append(ps.stem(token))

    return ' '.join(stems)

In [8]:
# Clean every tweet with preprocess(); the 'text' column is overwritten with the
# cleaned strings. One Python call per row, so this is slow on the full dataset.
data['text'] = data['text'].map(preprocess)


In [9]:
# 8. Hold out 20% of the rows for testing.
# stratify=y keeps the label proportions equal in both splits;
# random_state pins the shuffle so the split is reproducible.
X, y = data['text'], data['target']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)


In [None]:
# 9. Convert the cleaned tweets to TF-IDF features.
# The vocabulary (at most 5000 terms) is learned from the training split only;
# the test split is then projected onto that same vocabulary.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_vec, X_test_vec = (
    vectorizer.fit_transform(X_train),
    vectorizer.transform(X_test),
)

In [None]:
# 10. Fit a logistic regression classifier on the TF-IDF training matrix.
# fit() returns the estimator itself, so construction and fitting can be chained.
model = LogisticRegression(max_iter=1000).fit(X_train_vec, y_train)
# Trailing expression so the cell still displays the fitted estimator.
model


In [None]:
# 11. Score the classifier on the held-out split.
y_pred = model.predict(X_test_vec)
# Fraction of test tweets whose predicted label equals the true label.
acc = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Test Accuracy:", acc)

In [None]:
# 12. Predict sentiment for new tweets and print True/False
sample_tweets = ["I love this!", "I hate that!", "It was okay, not great."]

# The vectorizer was fitted on text that had already been through preprocess()
# (letters only, lowercased, stopwords removed, stemmed). New tweets must get
# the same cleaning before vectorizing; otherwise unstemmed words do not match
# the learned vocabulary and are silently ignored.
sample_clean = [preprocess(tweet) for tweet in sample_tweets]
sample_vec = vectorizer.transform(sample_clean)
predictions = model.predict(sample_vec)

# Output True if positive (label 1), False if negative (label 0);
# one entry per sample tweet, in the same order.
output = [bool(pred) for pred in predictions]
print(output)

In [None]:
# Number of test tweets to print. Printing every test row would emit one line
# per tweet (hundreds of thousands of lines for this dataset), flooding the
# notebook output and bloating the saved file.
N_PREVIEW = 10

# Vectorize the test tweets (from dataset)
X_test_vec = vectorizer.transform(X_test)

# Predict sentiment for test tweets
test_predictions = model.predict(X_test_vec)

# Convert predictions: True for positive, False for negative.
# The full list is kept so later cells can still use every prediction.
output = [bool(pred) for pred in test_predictions]

# Print only a small preview of (cleaned tweet, predicted sentiment) pairs
for tweet, sentiment in zip(X_test.iloc[:N_PREVIEW], output[:N_PREVIEW]):
    print(tweet, sentiment)
print(f"... showing {min(N_PREVIEW, len(output))} of {len(output)} test predictions")


In [None]:
import pickle

# Persist the fitted classifier (`model`) and the fitted text vectorizer
# (`vectorizer`) as two separate pickle files in the current working directory.
# Both are needed at inference time: the vectorizer turns text into features,
# and the model turns features into labels.
for pickle_path, artifact in (("model.pkl", model), ("vectorizer.pkl", vectorizer)):
    with open(pickle_path, "wb") as f:
        pickle.dump(artifact, f)

print("✅ Model and vectorizer saved successfully!")
