**Mount Google Drive and import necessary libraries**

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# NOTE(review): the dataset is later read from /content/Tweets.csv, not from
# the Drive mount -- confirm this cell is still needed.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import joblib
import pickle

**Download NLTK stopwords**

In [None]:
# Fetch the NLTK stop-word corpus; clean_text reads it through
# stopwords.words('english').
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

**Load the dataset**

In [None]:
# Upgrade gdown (pre-releases allowed via --pre); the next cell uses it to
# download the dataset. NOTE(review): the version is not pinned.
!pip install -U --no-cache-dir gdown --pre

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gdown
  Downloading gdown-4.7.1-py3-none-any.whl (15 kB)
Installing collected packages: gdown
  Attempting uninstall: gdown
    Found existing installation: gdown 4.6.6
    Uninstalling gdown-4.6.6:
      Successfully uninstalled gdown-4.6.6
Successfully installed gdown-4.7.1


In [None]:
# Download the tweets CSV by its Google Drive file id; the recorded output
# shows it being saved as /content/Tweets.csv.
!gdown 12Gnu6jDvK6tLkDqKGOOMzBvTKwrwDyy5

Downloading...
From: https://drive.google.com/uc?id=12Gnu6jDvK6tLkDqKGOOMzBvTKwrwDyy5
To: /content/Tweets.csv
  0% 0.00/3.50M [00:00<?, ?B/s]100% 3.50M/3.50M [00:00<00:00, 216MB/s]


In [None]:
# Read the CSV fetched by gdown into a DataFrame. Later cells use its
# 'textID', 'text', 'selected_text' and 'sentiment' columns.
import pandas as pd
df = pd.read_csv('/content/Tweets.csv')

**Remove unnecessary columns**

In [None]:
# Drop the tweet identifier; it is not used as a feature or label below.
# Reassigning with errors='ignore' (instead of inplace=True) keeps this cell
# safe to re-run: a second execution no longer raises KeyError.
df = df.drop(columns=['textID'], errors='ignore')

**Clean the text data**

In [None]:
# Clean the text data
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, reading the corpus only once."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def clean_text(text, stop_words=None):
    """Normalise a piece of text for TF-IDF vectorisation.

    Steps: lower-case, strip every character that is neither a word character
    nor whitespace, strip digit runs, collapse whitespace, and drop stop words.

    Parameters
    ----------
    text : object
        Value to clean. Non-string values are converted with ``str``.
        ``None`` and float NaN (missing cells) yield an empty string instead
        of the literal tokens "none"/"nan".
    stop_words : collection of str, optional
        Words to remove. Defaults to NLTK's English stop-word list, which is
        loaded once and cached rather than re-read for every word.

    Returns
    -------
    str
        The remaining words joined by single spaces.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if stop_words is None:
        stop_words = _english_stopwords()

    text = str(text).lower()
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)
    # split() with no arguments already collapses any run of whitespace.
    return ' '.join(word for word in text.split() if word not in stop_words)

In [None]:
import re
from nltk.corpus import stopwords

# Run both free-text columns through the same cleaning routine.
for column in ('text', 'selected_text'):
    df[column] = df[column].apply(clean_text)

**Map the sentiment to integer class labels**

In [None]:
# Encode the three sentiment classes as integer labels (0, 1, 2).
sentiment_to_label = {'negative': 0, 'neutral': 1, 'positive': 2}
df['sentiment'] = df['sentiment'].map(sentiment_to_label)

**Create feature vectors**

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Shared vectorizer: fitted by preprocess_data and reused at prediction time.
tfidf = TfidfVectorizer(
    strip_accents=None,
    lowercase=False,  # the text column is already lower-cased by clean_text
    preprocessor=None,
    use_idf=True,
    norm='l2',
    smooth_idf=True,
)
def preprocess_data(df, vectorizer=None):
    """Fit a TF-IDF vectorizer on ``df['text']`` and return the feature matrix.

    Stop-word removal is not performed here; it is expected to have been done
    on the text column beforehand.

    Parameters
    ----------
    df : mapping with a 'text' entry (e.g. a pandas DataFrame)
        Source of the documents to vectorise.
    vectorizer : object with ``fit_transform``, optional
        Vectorizer to fit. Defaults to the notebook-level ``tfidf`` instance,
        which is fitted as a side effect.

    Returns
    -------
    The value returned by ``vectorizer.fit_transform(df['text'])``.

    NOTE(review): the notebook calls this on the full DataFrame before
    train_test_split, so the fitted vocabulary/IDF weights also see the rows
    that end up in the test split.
    """
    if vectorizer is None:
        vectorizer = tfidf
    # fit_transform already returns the transformed matrix; the extra
    # transform() pass the original made was computed and thrown away.
    return vectorizer.fit_transform(df['text'])

**Split the dataset into training and testing sets**

In [None]:
from sklearn.model_selection import train_test_split

# Vectorise the cleaned tweets, then hold out 20% of the rows for evaluation.
X = preprocess_data(df)
y = df['sentiment']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

**Train a logistic regression model**

In [None]:
from sklearn.linear_model import LogisticRegression

# max_iter is raised above the library default because the recorded fit output
# reports "TOTAL NO. of ITERATIONS REACHED LIMIT", i.e. the solver stopped
# before converging.
lr = LogisticRegression(max_iter=1000)

In [None]:
# Fit the classifier on the training split. The recorded output below shows
# the solver reaching its iteration limit during this call.
lr.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


**Predict the sentiment of test data**

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Score the held-out split: overall accuracy plus the per-class confusion matrix.
y_pred = lr.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
acc = accuracy_score(y_test, y_pred)

print("Accuracy:", acc)
print("Confusion matrix:\n", cm)

Accuracy: 0.6887393123521921
Confusion matrix:
 [[ 883  592   87]
 [ 248 1734  248]
 [  49  487 1169]]


**Save the trained model**

In [None]:
import pickle as pkl

In [None]:
# Persist the fitted classifier to the current working directory.
with open("LRPWeights.pkl", mode="wb") as model_file:
    pkl.dump(lr, model_file)

**Reload the saved model and classify new text**

In [None]:
# Reload the classifier that was pickled to "LRPWeights.pkl". The relative
# path matches the one used when saving, so this no longer depends on the
# working directory being /content. The context manager closes the file
# handle, which the bare open() call left dangling.
# NOTE: pickle.load can execute arbitrary code -- only load files you created.
filename = 'LRPWeights.pkl'
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [None]:
def classify(prediction):
  """Translate a numeric class label into its display name.

  0 maps to 'Negative' and 1 to 'Neutral'; every other value maps to
  'Positive'.
  """
  for label, name in ((0, 'Negative'), (1, 'Neutral')):
    if prediction == label:
      return name
  return 'Positive'

In [None]:
import pickle
import numpy as np


data = input('Enter a sentence:')

# Apply the same cleaning used on the training text before vectorising.
# The vectorizer is built with lowercase=False and was fitted on lower-cased,
# punctuation-free text, so raw input such as "Sounds" or "fun!" would
# otherwise never match a vocabulary entry.
cleaned = clean_text(data)

# Word to Vector (kept in its own name so the training matrix X is not overwritten)
X_new = tfidf.transform([cleaned])

# Classify Prediction
predictions = lr.predict(X_new)[0]
print('Result:', classify(predictions))

Enter a sentence:Sounds like fun  LoL
Result: Positive


In [None]:
# Persist the fitted vectorizer so new text can be transformed with the same vocabulary.
with open("TFIDFWeights.pkl", mode="wb") as vectorizer_file:
    pkl.dump(tfidf, vectorizer_file)