In [1]:
import pandas as pd

# Load the CSV file into a DataFrame
# NOTE(review): this is an absolute, machine-specific Windows path; anyone else
# re-running the notebook has to edit it first. Consider a configurable,
# relative data directory.
df = pd.read_csv("C:\\Users\\lenovo\\Downloads\\balanced_processed_covid19_tweets_copy.csv")


In [2]:
# Preview the last rows of the freshly loaded frame (text plus raw sentiment label)
df.tail()

Unnamed: 0,text,sentiment
2332,case death today according worldometers anothe...,Anger
2333,nipostngn impose stupid inconsiderate charge l...,Anger
2334,sashadarapper bitch choked netflix go get job ...,Anger
2335,sad ugly triage texas covid tragedy unfold way,Anger
2336,republican ignore million american threat evic...,Anger


In [3]:
# Collapse the four emotion labels into two polarity classes. Labels that are
# not keys of this map (the data also contains 'Neutral') are left untouched
# by Series.replace.
replacement_map = {
    'Joy': 'positive',
    'Fear': 'negative',
    'Sad': 'negative',
    'Anger': 'negative'
}

# Assign the result back to the column rather than calling
# df['sentiment'].replace(..., inplace=True): the in-place form mutates an
# intermediate Series obtained by column selection, which pandas warns about
# and which does not update `df` at all once Copy-on-Write is active.
df['sentiment'] = df['sentiment'].replace(replacement_map)

# Check the updated DataFrame
print(df)

                                                   text sentiment
0     covid change work general recruiting specifica...   Neutral
1     wear face covering shopping includes visit loc...   Neutral
2     order logo graphicdesigner logodesign logodesi...   Neutral
3     rajasthan government today started plasma bank...   Neutral
4     nagaland police covid awareness city tower jun...   Neutral
...                                                 ...       ...
2332  case death today according worldometers anothe...  negative
2333  nipostngn impose stupid inconsiderate charge l...  negative
2334  sashadarapper bitch choked netflix go get job ...  negative
2335     sad ugly triage texas covid tragedy unfold way  negative
2336  republican ignore million american threat evic...  negative

[2337 rows x 2 columns]


In [4]:
# Class sizes; the rarest class determines how many rows every class keeps
sentiment_counts = df['sentiment'].value_counts()
min_count = sentiment_counts.min()

# For each class (in this fixed order) take its rows and, only when the class
# is larger than the rarest one, draw a seeded random subset of min_count rows
positive_rows, neutral_rows, negative_rows = (
    subset.sample(n=min_count, random_state=42) if len(subset) > min_count else subset
    for subset in (
        df.loc[df['sentiment'].eq(label)]
        for label in ('positive', 'Neutral', 'negative')
    )
)

# Stack the three equally sized groups, shuffle with a fixed seed, and
# renumber the rows from zero
balanced_df = (
    pd.concat([positive_rows, neutral_rows, negative_rows])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Sanity check: every class should now have the same count
print(balanced_df['sentiment'].value_counts())



positive    457
Neutral     457
negative    457
Name: sentiment, dtype: int64


In [5]:
# Plain-text dump of the balanced, shuffled frame (pandas truncates the middle rows)
print(balanced_df)

                                                   text sentiment
0     sound like plandemic get second life here thre...  positive
1              latest islam islamism berkleyforum covid   Neutral
2     staysafe summertime covid challenge put much s...  positive
3     dear realdonaldtrump covid vaccine go market p...  positive
4     wash hand clean hand save life download free g...  positive
...                                                 ...       ...
1366  badbitchinaz id pay republican leave america v...  negative
1367  washington demanding australian government par...  negative
1368  toonceslives resisting bitch face resist covid...  negative
1369  tussfc told trial consist people getting covid...   Neutral
1370         nypost high temp humidity kill covid virus  negative

[1371 rows x 2 columns]


In [6]:
# Work on an independent copy. A plain `df = balanced_df` only creates a second
# name for the same object, so the in-place column edits made to `df` in the
# following cells (label encoding, feature columns) would silently rewrite
# `balanced_df` as well and make this cell non-repeatable.
df = balanced_df.copy()

In [7]:
# Define labels mapping
label_mapping = {'positive': 0, 'Neutral': 1, 'negative': 2}

# Convert sentiment labels to numerical labels.
# Series.map yields NaN for any value that is not a key of the mapping, so the
# result is validated before it overwrites the column. This also catches an
# accidental second run of this cell, where the column already holds 0/1/2 and
# would otherwise be turned into all-NaN without any error.
encoded_sentiment = df['sentiment'].map(label_mapping)
unmapped_labels = df.loc[encoded_sentiment.isna(), 'sentiment'].unique()
if len(unmapped_labels) > 0:
    raise ValueError(
        f"Sentiment values without a numeric code: {list(unmapped_labels)!r} "
        "(was this cell already run on encoded labels?)"
    )
df['sentiment'] = encoded_sentiment

In [8]:
# Confirm the sentiment column now holds the integer class codes
df.head()

Unnamed: 0,text,sentiment
0,sound like plandemic get second life here thre...,0
1,latest islam islamism berkleyforum covid,1
2,staysafe summertime covid challenge put much s...,0
3,dear realdonaldtrump covid vaccine go market p...,0
4,wash hand clean hand save life download free g...,0


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer


# Tweet strings in DataFrame row order
corpus = list(df['text'])

# TF-IDF vectorizer whose vocabulary is capped at 100 terms
# NOTE(review): the vectorizer is fitted on every row here, before the
# train/test split performed in a later cell, so test tweets influence the
# vocabulary and IDF weights.
tfidf_vectorizer = TfidfVectorizer(max_features=100)

# Fit on the corpus and convert the sparse result to a dense NumPy array
tfidf_features = tfidf_vectorizer.fit_transform(corpus).toarray()

In [10]:
import numpy as np
import pandas as pd



# Load GloVe word vectors
def load_glove_embeddings(file_path):
    """Read a whitespace-separated word-vector text file into a dictionary.

    Every non-blank line is split on whitespace: the first token is used as the
    word and the remaining tokens are parsed as float32 vector components.
    Blank lines (for example a trailing empty line) are skipped instead of
    raising IndexError.

    Parameters
    ----------
    file_path : str or path-like
        Location of the UTF-8 encoded vector file.

    Returns
    -------
    dict
        Maps each word to a 1-D float32 NumPy array. If a word occurs on more
        than one line, the last occurrence wins.
    """
    word_to_vec = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                # Nothing on this line; there is no word to index.
                continue
            word_to_vec[values[0]] = np.array(values[1:], dtype='float32')
    return word_to_vec

# Parse the pre-trained vectors into a word -> vector dictionary.
# NOTE(review): absolute, machine-specific path; the file name suggests the
# 100-dimensional GloVe 6B vectors — confirm before changing embedding sizes.
glove_embeddings = load_glove_embeddings("C:\\Users\\lenovo\\My_Folders\\7th_sem\\IT350\\IT350_Project\\glove.6B.100d.txt")

# Convert tweets to GloVe embeddings
def get_average_embedding(tweet, word_embeddings):
    """Average the embedding vectors of the whitespace-separated words in a tweet.

    Words missing from ``word_embeddings`` contribute an all-zero vector, so
    they still count towards the denominator of the mean.

    Parameters
    ----------
    tweet : str
        Text to embed; it is split on whitespace.
    word_embeddings : dict
        Maps a word to its 1-D embedding vector.

    Returns
    -------
    numpy.ndarray
        1-D vector with the same length as the vectors in ``word_embeddings``
        (100 when the mapping is empty). A tweet with no words yields an
        all-zero vector rather than NaN, so results can always be stacked.
    """
    # Take the dimensionality from the table instead of assuming 100.
    if word_embeddings:
        dim = len(next(iter(word_embeddings.values())))
    else:
        dim = 100

    words = tweet.split()
    if not words:
        # np.mean over an empty list would return scalar NaN.
        return np.zeros(dim)

    missing = np.zeros(dim)
    embeddings = [word_embeddings.get(word, missing) for word in words]
    return np.mean(embeddings, axis=0)

# Embed every tweet with the GloVe table and stack the per-tweet vectors into
# a single 2-D NumPy array (one row per tweet, in DataFrame row order)
glove_features = np.vstack(
    [get_average_embedding(tweet, glove_embeddings) for tweet in df['text']]
)

In [11]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize




# Lower-case each tweet, then split it into tokens with NLTK
tokenized_tweets = [word_tokenize(text) for text in df['text'].str.lower()]

# Train Word2Vec on these tweets only: 100-dimensional vectors, context window
# of 5, every token kept (min_count=1), CBOW architecture (sg=0)
word2vec_model = Word2Vec(
    sentences=tokenized_tweets,
    vector_size=100,
    window=5,
    min_count=1,
    sg=0,
)

# Function to get the word embedding for a tweet
def get_average_word2vec_embedding(tweet, word2vec_model):
    """Average the Word2Vec vectors of the tokens in a tweet.

    The tweet is lower-cased and tokenized with ``word_tokenize``; tokens that
    are not in the model's vocabulary are ignored.

    Parameters
    ----------
    tweet : str
        Text to embed.
    word2vec_model
        Trained model exposing its keyed vectors as ``.wv``.

    Returns
    -------
    numpy.ndarray
        Mean of the in-vocabulary token vectors, or an all-zero vector when no
        token is in the vocabulary. The zero vector's length is read from
        ``word2vec_model.wv.vector_size`` rather than hard-coded to 100, so it
        stays consistent if the model is trained with another size.
    """
    keyed_vectors = word2vec_model.wv
    words = word_tokenize(tweet.lower())
    embeddings = [keyed_vectors[word] for word in words if word in keyed_vectors]
    if not embeddings:
        return np.zeros(keyed_vectors.vector_size)
    return np.mean(embeddings, axis=0)

# Embed every tweet with the trained Word2Vec model and stack the per-tweet
# vectors into one 2-D NumPy array (one row per tweet, in DataFrame row order)
word2vec_features = np.vstack(
    [get_average_word2vec_embedding(tweet, word2vec_model) for tweet in df['text']]
)

In [12]:
from gensim.models import FastText
from gensim.utils import simple_preprocess
import pandas as pd



# Train a FastText model on the gensim-preprocessed tweets: 100-dimensional
# vectors, context window of 5, every token kept (min_count=1), skip-gram (sg=1)
model = FastText(
    sentences=df['text'].apply(simple_preprocess),
    vector_size=100,
    window=5,
    min_count=1,
    sg=1,
)

# Function to get the FastText embedding for a sentence
def get_fasttext_embedding(sentence):
    """Sum the FastText vectors of the tokens in a sentence.

    The sentence is tokenized with gensim's ``simple_preprocess`` and the
    vectors of the resulting tokens are added together. This is a sum, not a
    mean, so its magnitude grows with the number of tokens.

    NOTE(review): this reads the module-level ``model``. A later cell rebinds
    that name to a Keras network, so this function must be used before that
    cell runs (or the FastText model must be retrained first).

    Parameters
    ----------
    sentence : str
        Text to embed.

    Returns
    -------
    numpy.ndarray
        1-D vector of length ``model.wv.vector_size``. When preprocessing
        leaves no tokens, an all-zero vector is returned. The built-in
        ``sum([])`` would return the integer 0, which cannot be stacked with
        the other rows.
    """
    words = simple_preprocess(sentence)
    vectors = [model.wv[word] for word in words if word in model.wv]
    if not vectors:
        # float32 so one empty sentence does not upcast the stacked matrix
        # (gensim stores its vectors as float32).
        return np.zeros(model.wv.vector_size, dtype=np.float32)
    return sum(vectors)

# Embed every tweet with the FastText model and stack the per-tweet vectors
# into one 2-D NumPy array (one row per tweet, in DataFrame row order)
fasttext_embeddings = np.vstack(
    [get_fasttext_embedding(tweet) for tweet in df['text']]
)

In [13]:
# Attach each feature matrix to the frame as a column of per-row Python lists
for column_name, feature_matrix in (
    ('tfidf_features', tfidf_features),
    ('glove_features', glove_features),
    ('word2vec_features', word2vec_features),
    ('fasttext_embeddings', fasttext_embeddings),
):
    df[column_name] = feature_matrix.tolist()

df.head()

Unnamed: 0,text,sentiment,tfidf_features,glove_features,word2vec_features,fasttext_embeddings
0,sound like plandemic get second life here thre...,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-0.16007492433373743, 0.3628024634403678, 0.3...","[0.0003457685234025121, 0.006037031300365925, ...","[-1.0274704694747925, 2.6667635440826416, -3.4..."
1,latest islam islamism berkleyforum covid,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.09475399851799012, 0.10903059989213944, 0.0...","[-0.001335539622232318, 0.008357246406376362, ...","[-0.2983933985233307, 0.7834877967834473, -1.0..."
2,staysafe summertime covid challenge put much s...,0,"[0.0, 0.0, 0.0, 0.5785987173475471, 0.0, 0.0, ...","[-0.3035103985323356, 0.22403684530693752, 0.2...","[-0.003014534479007125, 0.0006508661899715662,...","[-1.0606491565704346, 2.7465343475341797, -3.5..."
3,dear realdonaldtrump covid vaccine go market p...,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0025485828518867493, 0.12183674859503905, 0...","[-0.0010916515020653605, 0.0020927099976688623...","[-1.1256067752838135, 2.9913766384124756, -3.8..."
4,wash hand clean hand save life download free g...,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-0.18380731344223022, 0.11013955622911453, 0....","[0.0009444393217563629, 0.0031865646596997976,...","[-0.9953127503395081, 2.553715229034424, -3.34..."


In [14]:
# import numpy as np
# import pandas as pd
# from sklearn.model_selection import train_test_split
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import GRU, Dense, Dropout, Embedding, GlobalMaxPooling1D
# from tensorflow.keras.preprocessing.sequence import pad_sequences
# from tensorflow.keras.utils import to_categorical

# # Assuming your data is in a DataFrame called 'df'

# # Combine all features into one array
# X_tfidf = np.array(df['tfidf_features'].tolist())
# X_glove = np.array(df['glove_features'].tolist())
# X_word2vec = np.array(df['word2vec_features'].tolist())
# X_fasttext = np.array(df['fasttext_embeddings'].tolist())

# # Concatenate features
# X = np.concatenate((X_tfidf, X_glove, X_word2vec, X_fasttext), axis=1)

# # Assuming you have a 'sentiment' column in your DataFrame
# y = np.array(df['sentiment'])

# # Convert labels to one-hot encoded vectors
# y = to_categorical(y, num_classes=3)  # 3 classes in total

# # Split data into train and test sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # Pad sequences individually
# max_len = 100  # Define your maximum sequence length
# X_train = pad_sequences(X_train, maxlen=max_len, padding='post', truncating='post')  
# X_test = pad_sequences(X_test, maxlen=max_len, padding='post', truncating='post')

# # Build GRU model
# model = Sequential()
# model.add(Embedding(input_dim=X.shape[1], output_dim=128, input_length=max_len))  # Adjust output_dim as needed
# model.add(GRU(64, return_sequences=True))  # Adjust GRU units as needed
# model.add(GlobalMaxPooling1D())
# model.add(Dropout(0.5))
# model.add(Dense(32, activation='relu'))  # Additional dense layer
# model.add(Dense(3, activation='softmax'))  # Output layer with 3 classes

# # Compile the model
# model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# # Train the model
# model.fit(X_train, y_train, epochs=100, batch_size=64, validation_data=(X_test, y_test))


In [37]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Concatenate, Dense, Dropout
from sklearn.preprocessing import StandardScaler

# Rebuild one 2-D array per feature family from the list-valued columns of `df`
X_tfidf, X_glove, X_word2vec, X_fasttext = (
    np.array(df[column].tolist())
    for column in (
        'tfidf_features',
        'glove_features',
        'word2vec_features',
        'fasttext_embeddings',
    )
)

# Join the four families side by side into one design matrix
X = np.hstack((X_tfidf, X_glove, X_word2vec, X_fasttext))
# Alternative kept from an earlier experiment (GloVe + Word2Vec only):
# X = np.concatenate((X_glove, X_word2vec), axis=1)

# Integer class labels taken from the 'sentiment' column
y = np.array(df['sentiment'])

# Hold out 20% of the rows, with a fixed seed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=40
)

# Standardize the features: statistics are learned from the training rows only
# and then applied unchanged to both partitions
scaler = StandardScaler()
scaler.fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# Build a feed-forward classifier over the concatenated feature vector
model = Sequential([
    Dense(256, activation='relu', input_shape=(X.shape[1],)),
    Dropout(0.5),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(64, activation='relu'),
    # Output layer with 3 classes. The classes are mutually exclusive (each row
    # carries a single integer label 0/1/2) and the loss below is
    # sparse_categorical_crossentropy, which expects a probability distribution
    # over the classes. softmax produces one; the previous sigmoid activation
    # scored each class independently and its outputs did not sum to 1.
    Dense(3, activation='softmax'),
])

# Compile the model (integer labels -> sparse categorical cross-entropy)
model.compile(optimizer='Adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
# NOTE(review): the held-out test split doubles as validation data here, so the
# reported validation metrics are not an independent estimate if they are used
# to tune the model.
model.fit(X_train, y_train, epochs=50, batch_size=64, validation_data=(X_test, y_test))


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.src.callbacks.History at 0x23eb56f6a60>

In [16]:
# Inspect the training labels (integer class codes)
y_train

array([0, 0, 2, ..., 1, 1, 2], dtype=int64)

In [17]:
# Tally the training labels per class code: 0 -> positive, 1 -> neutral, 2 -> negative
p, n, neg = (int(np.count_nonzero(y_train == label)) for label in (0, 1, 2))

print("Pos : ",p)
print("Neutral : ",n)
print("Neg : ",neg)
        

Pos :  367
Neutral :  365
Neg :  364


In [18]:
# Inspect the held-out labels (integer class codes)
y_test

array([2, 1, 1, 0, 2, 2, 1, 0, 0, 0, 1, 1, 0, 1, 2, 2, 2, 1, 0, 2, 0, 1,
       0, 0, 0, 2, 0, 1, 2, 2, 0, 1, 0, 0, 0, 2, 1, 2, 0, 2, 0, 0, 0, 2,
       2, 1, 0, 0, 0, 0, 1, 2, 1, 1, 1, 1, 0, 0, 0, 2, 0, 2, 2, 0, 0, 1,
       1, 2, 2, 2, 2, 1, 2, 2, 1, 1, 2, 1, 2, 0, 0, 2, 0, 1, 1, 0, 1, 0,
       2, 1, 1, 1, 2, 0, 0, 0, 1, 2, 0, 2, 0, 1, 2, 0, 0, 0, 2, 0, 2, 1,
       1, 1, 1, 2, 2, 2, 0, 1, 2, 0, 2, 2, 1, 1, 2, 1, 1, 1, 2, 2, 0, 0,
       0, 0, 0, 2, 1, 1, 1, 0, 0, 2, 0, 0, 2, 1, 1, 0, 0, 0, 2, 1, 0, 2,
       0, 2, 1, 2, 0, 0, 0, 0, 2, 1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 2, 2, 1,
       2, 2, 0, 1, 1, 2, 0, 0, 1, 2, 2, 1, 1, 0, 1, 1, 2, 2, 2, 0, 2, 1,
       0, 1, 1, 0, 1, 2, 2, 2, 1, 2, 0, 0, 2, 2, 0, 2, 1, 0, 1, 1, 2, 2,
       2, 0, 0, 2, 2, 1, 2, 2, 1, 0, 1, 1, 2, 0, 2, 1, 0, 1, 1, 2, 1, 1,
       1, 0, 1, 1, 2, 1, 1, 2, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 2, 0, 0, 2,
       2, 0, 0, 1, 1, 2, 0, 2, 2, 0, 2], dtype=int64)

In [19]:
# Tally the held-out labels per class code: 0 -> positive, 1 -> neutral, 2 -> negative
p, n, neg = (int(np.count_nonzero(y_test == label)) for label in (0, 1, 2))

print("Pos : ",p)
print("Neutral : ",n)
print("Neg : ",neg)

Pos :  90
Neutral :  92
Neg :  93
