1. FETCHING DATASET FROM DRIVE

In [7]:
import pandas
train_df=pandas.read_csv("Corona_NLP_train.csv",encoding='latin-1')
train_df.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,Positive
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,Positive
3,3802,48754,,16-03-2020,My food stock is not the only one which is emp...,Positive
4,3803,48755,,16-03-2020,"Me, ready to go at supermarket during the #COV...",Extremely Negative


In [8]:
test_df=pandas.read_csv("Corona_NLP_test.csv")
test_df.head()


Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,1,44953,NYC,02-03-2020,TRENDING: New Yorkers encounter empty supermar...,Extremely Negative
1,2,44954,"Seattle, WA",02-03-2020,When I couldn't find hand sanitizer at Fred Me...,Positive
2,3,44955,,02-03-2020,Find out how you can protect yourself and love...,Extremely Positive
3,4,44956,Chicagoland,02-03-2020,#Panic buying hits #NewYork City as anxious sh...,Negative
4,5,44957,"Melbourne, Victoria",03-03-2020,#toiletpaper #dunnypaper #coronavirus #coronav...,Neutral


2.TRAINING OF DATASET WITH PREPROCESSING

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
import pickle

def train_logistic_regression(sentences, labels, max_features=5000, test_size=0.3, random_state=42, max_iter=1000):
    """
    Trains a Logistic Regression model for text classification.

    Args:
        sentences (list): List of text sentences.
        labels (list): List of corresponding labels.
        max_features (int): Maximum number of features for TF-IDF.
        test_size (float): Proportion of the dataset to include in the test split.
        random_state (int): Seed for the random number generator.
        max_iter (int): Maximum number of iterations for Logistic Regression.

    Returns:
        tuple: Trained Logistic Regression model and TF-IDF vectorizer.
    """

    # 1. Feature Extraction (TF-IDF)
    vectorizer = TfidfVectorizer(max_features=max_features)
    X = vectorizer.fit_transform(sentences)

    # 2. Train-Test Split
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        labels,
        test_size=test_size,
        random_state=random_state
    )

    # 3. Model Training
    model = LogisticRegression(max_iter=max_iter, solver='lbfgs', multi_class='auto')
    model.fit(X_train, y_train)

    # 4. Model Evaluation
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy:.4f}")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

    return model, vectorizer

3.TESTING OF MODEL/ PREDICTING FROM MODEL

In [10]:
def predict_logistic_regression(model, vectorizer, new_sentences):
    """
    Makes predictions using a trained Logistic Regression model.

    Args:
        model (LogisticRegression): Trained Logistic Regression model.
        vectorizer (TfidfVectorizer): Trained TF-IDF vectorizer.
        new_sentences (list): List of new sentences to predict.

    Returns:
        numpy.ndarray: Predicted labels.
    """
    new_X = vectorizer.transform(new_sentences)
    new_predictions = model.predict(new_X)
    return new_predictions


4.SAVING AND LOADING OF MODEL

In [11]:
def save_model(model, vectorizer, model_filename="logistic_model.pkl", vectorizer_filename="tfidf_vectorizer.pkl"):
    """
    Saves the trained model and vectorizer to disk.

    Args:
        model (LogisticRegression): Trained Logistic Regression model.
        vectorizer (TfidfVectorizer): Trained TF-IDF vectorizer.
        model_filename (str): Filename to save the model.
        vectorizer_filename (str): Filename to save the vectorizer.
    """
    with open(model_filename, 'wb') as f:
        pickle.dump(model, f)
    with open(vectorizer_filename, 'wb') as f:
        pickle.dump(vectorizer, f)

def load_model(model_filename="logistic_model.pkl", vectorizer_filename="tfidf_vectorizer.pkl"):
    """
    Loads the trained model and vectorizer from disk.

    Args:
        model_filename (str): Filename of the saved model.
        vectorizer_filename (str): Filename of the saved vectorizer.

    Returns:
        tuple: Loaded Logistic Regression model and TF-IDF vectorizer.
    """
    with open(model_filename, 'rb') as f:
        loaded_model = pickle.load(f)
    with open(vectorizer_filename, 'rb') as f:
        loaded_vectorizer = pickle.load(f)
    return loaded_model, loaded_vectorizer


5.MAIN SECTION

In [12]:
#Example usage:
test_df = test_df.drop(
    columns=['UserName','ScreenName','Location','TweetAt'],
    axis=1
)

In [13]:
sentences = train_df['OriginalTweet'].tolist()
labels = train_df['Sentiment'].tolist()

train_df.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,Positive
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,Positive
3,3802,48754,,16-03-2020,My food stock is not the only one which is emp...,Positive
4,3803,48755,,16-03-2020,"Me, ready to go at supermarket during the #COV...",Extremely Negative


In [14]:
#training of model
model, vectorizer = train_logistic_regression(sentences, labels)

#testing of model
new_sentences = test_df['OriginalTweet'].tolist()
predictions = predict_logistic_regression(model, vectorizer, new_sentences)
print("Predictions:", predictions)

#Saving model
save_model(model, vectorizer)

loaded_model, loaded_vectorizer = load_model()

loaded_predictions = predict_logistic_regression(loaded_model, loaded_vectorizer, new_sentences)
print("Loaded predictions:", loaded_predictions)



Accuracy: 0.5677
Classification Report:
                    precision    recall  f1-score   support

Extremely Negative       0.61      0.47      0.53      1572
Extremely Positive       0.68      0.54      0.60      1989
          Negative       0.50      0.52      0.51      3005
           Neutral       0.63      0.67      0.65      2292
          Positive       0.53      0.60      0.56      3490

          accuracy                           0.57     12348
         macro avg       0.59      0.56      0.57     12348
      weighted avg       0.57      0.57      0.57     12348

Predictions: ['Neutral' 'Positive' 'Extremely Positive' ... 'Neutral'
 'Extremely Negative' 'Positive']
Loaded predictions: ['Neutral' 'Positive' 'Extremely Positive' ... 'Neutral'
 'Extremely Negative' 'Positive']


THE FINAL CODE AFTER APPLYING PIPELINE

In [15]:
import pandas
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
import pickle

# Load the data
train_df = pandas.read_csv("Corona_NLP_train.csv", encoding='latin-1')
test_df = pandas.read_csv("Corona_NLP_test.csv")

# Data preprocessing
train_df = train_df.drop(columns=['UserName', 'ScreenName', 'Location', 'TweetAt'], axis=1)

# Prepare sentences and labels
sentences = train_df['OriginalTweet'].tolist()
labels = train_df['Sentiment'].tolist()

# Label Encoding
label_encoder = LabelEncoder()
labels_encoded = label_encoder.fit_transform(labels)

# Create the pipeline
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=5000)),
    ('clf', LogisticRegression(max_iter=1000, solver='lbfgs', multi_class='auto'))
])

# Train the pipeline
pipeline.fit(sentences, labels_encoded)

# Make predictions on test data
test_sentences = test_df['OriginalTweet'].tolist()
predictions_encoded = pipeline.predict(test_sentences)

# Decode predictions back to original labels
predictions = label_encoder.inverse_transform(predictions_encoded)

# Evaluate the pipeline
X_train, X_test, y_train, y_test = train_test_split(sentences, labels_encoded, test_size=0.3, random_state=42)
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Predictions:", predictions)

# Save the pipeline and label encoder
with open('sentiment_pipeline.pkl', 'wb') as f:
    pickle.dump((pipeline, label_encoder), f)

# Load the pipeline and label encoder
with open('sentiment_pipeline.pkl', 'rb') as f:
    loaded_pipeline, loaded_label_encoder = pickle.load(f)

# Make predictions using loaded pipeline
loaded_predictions_encoded = loaded_pipeline.predict(test_sentences)
loaded_predictions = loaded_label_encoder.inverse_transform(loaded_predictions_encoded)
print("Loaded predictions:", loaded_predictions)



Accuracy: 0.7265
Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.65      0.71      1572
           1       0.81      0.69      0.74      1989
           2       0.69      0.69      0.69      3005
           3       0.76      0.81      0.78      2292
           4       0.68      0.76      0.72      3490

    accuracy                           0.73     12348
   macro avg       0.75      0.72      0.73     12348
weighted avg       0.73      0.73      0.73     12348

Predictions: ['Neutral' 'Positive' 'Extremely Positive' ... 'Neutral'
 'Extremely Negative' 'Positive']
Loaded predictions: ['Neutral' 'Positive' 'Extremely Positive' ... 'Neutral'
 'Extremely Negative' 'Positive']
