In [1]:
pip install pandas scikit-learn


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.2.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [5]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Step 1: Load the dataset
df = pd.read_csv('bjp_tweets.csv')  # make sure this is the correct path

# Step 2: Preprocessing function
def preprocess(text):
    text = text.lower()
    text = re.sub(r'http\S+|www\S+', '', text)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    return text.strip()

# Step 3: Apply preprocessing
df['tweet'] = df['tweet'].apply(preprocess)

# Step 4: Vectorization
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X = vectorizer.fit_transform(df['tweet'])
y = df['target']

# Step 5: Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 6: Train Logistic Regression
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

# Step 7: Train Naive Bayes
nb = MultinomialNB()
nb.fit(X_train, y_train)

# Step 8: Evaluate models
print("Logistic Regression Accuracy:", accuracy_score(y_test, logreg.predict(X_test)))
print("Naive Bayes Accuracy:", accuracy_score(y_test, nb.predict(X_test)))

# Step 9: Function to predict sentiment
def predict_sentiment(model, text):
    text = preprocess(text)
    vector = vectorizer.transform([text])
    prediction = model.predict(vector)[0]
    return "Positive" if prediction == 1 else "Negative"

# Step 10: Allow user to enter tweet
while True:
    user_input = input("\nEnter a tweet (or type 'exit' to quit): ")
    if user_input.lower() == 'exit':
        break
    print("Logistic Regression Prediction:", predict_sentiment(logreg, user_input))
    print("Naive Bayes Prediction:", predict_sentiment(nb, user_input))


Logistic Regression Accuracy: 0.7794058205335489
Naive Bayes Accuracy: 0.7561641067097817
Logistic Regression Prediction: Negative
Naive Bayes Prediction: Negative


In [7]:
import pickle

# Save models and vectorizer
with open("logreg_model.pkl", "wb") as f:
    pickle.dump(logreg, f)

with open("nb_model.pkl", "wb") as f:
    pickle.dump(nb, f)

with open("vectorizer.pkl", "wb") as f:
    pickle.dump(vectorizer, f)

# Save accuracies
logreg_acc = accuracy_score(y_test, logreg.predict(X_test))
nb_acc = accuracy_score(y_test, nb.predict(X_test))

with open("accuracies.pkl", "wb") as f:
    pickle.dump({"logreg": logreg_acc, "nb": nb_acc}, f)
