In [2]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
import re
import nltk
from nltk import word_tokenize
import pickle

# Fetch the NLTK resources used below: the stop-word list and the 'punkt'
# tokenizer models.
for nltk_resource in ('stopwords', 'punkt'):
    nltk.download(nltk_resource)

# Load the data.
# NOTE(review): these are absolute paths to one machine's desktop folder; the
# notebook will not run elsewhere without editing them.
# NOTE(review): read_csv is called with its default header handling, so the
# first line of each file is consumed as column names (and then overwritten
# below). If the CSVs have no header row, that first record is silently
# dropped -- confirm, and consider header=None / names=[...]. Any change to
# how the rows are parsed changes the fitted vocabulary, so previously saved
# model / vectorizer pickles would have to be regenerated.
val_data = pd.read_csv('C:/Users/Hari Haran/OneDrive/Desktop/GUVI/Project_1 _youtube_harvesting/ML_Projects/twitter_validation.csv')
train_data = pd.read_csv('C:/Users/Hari Haran/OneDrive/Desktop/GUVI/Project_1 _youtube_harvesting/ML_Projects/twitter_training.csv')


def _to_clean_text(lowered):
    """Coerce an already lower-cased cell to ``str`` and replace every run of
    characters other than ASCII letters, digits and spaces with one space.

    A missing value (NaN) is stringified as well, so it becomes the literal
    text ``'nan'``.
    """
    return re.sub('[^A-Za-z0-9 ]+', ' ', str(lowered))


# Preprocess the data: name the four columns, then derive a normalised
# "lower" column from the raw tweet text in both frames.
for frame in (train_data, val_data):
    frame.columns = ['id', 'information', 'type', 'text']
    frame["lower"] = frame["text"].str.lower().map(_to_clean_text)

# English stop words shipped with NLTK.
stopwords = nltk.corpus.stopwords.words('english')

# Hold out 20% of the training file for testing; the seed is fixed so the
# split is reproducible.
reviews_train, reviews_test = train_test_split(train_data, test_size=0.2, random_state=42)

# Targets are the raw string labels from the 'type' column.
y_train_bow, y_test_bow = reviews_train['type'], reviews_test['type']

# Bag-of-words over 1- to 4-grams, tokenised with NLTK's word_tokenize.
bow_counts = CountVectorizer(
    ngram_range=(1, 4),
    stop_words=stopwords,
    tokenizer=word_tokenize,
)

# The vocabulary is learned from the training split only; the test split is
# merely projected onto that vocabulary.
X_train_bow = bow_counts.fit_transform(reviews_train["lower"])
X_test_bow = bow_counts.transform(reviews_test["lower"])

# Fit the classifier on the bag-of-words training matrix (fit() returns the
# estimator itself, so construction and training are chained).
model1 = LogisticRegression(C=1, solver="liblinear", max_iter=200).fit(X_train_bow, y_train_bow)

# Score the held-out test split. This accuracy is printed as a fraction.
test_pred = model1.predict(X_test_bow)
test_accuracy = accuracy_score(y_test_bow, test_pred)
print("Accuracy: ", test_accuracy)
print(classification_report(y_test_bow, test_pred))

# Score the separate validation file, encoded with the training vocabulary.
# Unlike the figure above, this one is printed as a percentage under the same
# "Accuracy" label.
y_val_bow = val_data['type']
X_val_bow = bow_counts.transform(val_data["lower"])
Val_res = model1.predict(X_val_bow)
val_accuracy_pct = accuracy_score(y_val_bow, Val_res) * 100
print("Accuracy: ", val_accuracy_pct)

# Integer-encode the string labels. The encoder is fitted on the training
# labels; the test and validation labels reuse that mapping.
le = LabelEncoder()
y_train_bow_num = le.fit_transform(y_train_bow)
y_test_bow_num, y_val_bow_num = (
    le.transform(labels) for labels in (y_test_bow, y_val_bow)
)

# Persist the trained classifier and the fitted CountVectorizer next to the
# notebook so they can be reloaded together for inference.
for pickle_name, artifact in (
    ('logistic_regression_model.pkl', model1),
    ('count_vectorizer.pkl', bow_counts),
):
    with open(pickle_name, 'wb') as file:
        pickle.dump(artifact, file)

# On Google Colab the two pickle files can be downloaded with
# google.colab.files.download('<file name>').


[nltk_data] Downloading package stopwords to C:\Users\Hari
[nltk_data]     Haran\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Hari
[nltk_data]     Haran\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Accuracy:  0.8960969404833634
              precision    recall  f1-score   support

  Irrelevant       0.96      0.83      0.89      2661
    Negative       0.91      0.92      0.91      4471
     Neutral       0.90      0.88      0.89      3551
    Positive       0.84      0.93      0.89      4254

    accuracy                           0.90     14937
   macro avg       0.91      0.89      0.90     14937
weighted avg       0.90      0.90      0.90     14937

Accuracy:  97.997997997998


In [5]:
import pickle
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import nltk
from nltk import word_tokenize
import re

# Restore the trained logistic-regression classifier from disk.
# NOTE(review): pickle.load executes code embedded in the file -- only load
# pickles you produced yourself.
with open('logistic_regression_model.pkl', 'rb') as model_file:
    logistic_model = pickle.load(model_file)

# Example tweets to classify.
new_data = [
    "I'm tired and finished playing this borderland.",
    "Another example of a tweet.",
]

# Preprocess the new data
def preprocess(text):
    """Normalise one tweet for vectorisation.

    The text is lower-cased, then every run of characters other than ASCII
    letters, digits and spaces is replaced by a single space.

    Args:
        text: The raw tweet as a ``str``.

    Returns:
        The normalised string.
    """
    return re.sub('[^A-Za-z0-9 ]+', ' ', text.lower())

# Apply the same text normalisation that was used on the training data.
new_data = [preprocess(text) for text in new_data]

# Reuse the exact CountVectorizer that was fitted and pickled by the training
# cell instead of re-reading the CSV and fitting a fresh one. The classifier's
# coefficients are tied to that vectorizer's vocabulary and column order, so a
# refit on even slightly different data would misalign (or change the number
# of) features. Loading the pickle also avoids rebuilding the 1- to 4-gram
# vocabulary on every inference run.
# NOTE(review): pickle.load executes code embedded in the file -- only load
# pickles you produced yourself. Unpickling needs nltk to be importable,
# because the vectorizer references nltk's word_tokenize as its tokenizer.
with open('count_vectorizer.pkl', 'rb') as file:
    bow_counts = pickle.load(file)

# Encode the new tweets with the training vocabulary (transform only, no fit).
X_new_data_bow = bow_counts.transform(new_data)

# Make predictions using logistic regression
logistic_predictions = logistic_model.predict(X_new_data_bow)
print("Logistic Regression Predictions:", logistic_predictions)






Logistic Regression Predictions: ['Negative' 'Irrelevant']
