In [4]:
import pandas as pd

train_df = pd.read_csv('/content/train.csv', encoding='latin1')
test_df = pd.read_csv('/content/test.csv', encoding='latin1')


print("Train sentiment classes:")
print(train_df['sentiment'].unique())
print(test_df['sentiment'].unique())


Train sentiment classes:
['neutral' 'negative' 'positive']
['neutral' 'positive' 'negative' nan]


In [None]:
print("Train DataFrame shape:", train_df.shape)
print("Test DataFrame shape:", test_df.shape)


In [6]:
print("Train DataFrame info:")
train_df.info()

print("Test DataFrame info:")
test_df.info()


Train DataFrame info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27481 entries, 0 to 27480
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   textID            27481 non-null  object 
 1   text              27480 non-null  object 
 2   selected_text     27480 non-null  object 
 3   sentiment         27481 non-null  object 
 4   Time of Tweet     27481 non-null  object 
 5   Age of User       27481 non-null  object 
 6   Country           27481 non-null  object 
 7   Population -2020  27481 non-null  int64  
 8   Land Area (Km²)   27481 non-null  float64
 9   Density (P/Km²)   27481 non-null  int64  
dtypes: float64(1), int64(2), object(7)
memory usage: 2.1+ MB
Test DataFrame info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4815 entries, 0 to 4814
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   textID           

In [6]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score


train_df = pd.read_csv('/content/train.csv', encoding='latin1')
test_df = pd.read_csv('/content/test.csv', encoding='latin1')


train_df.dropna(subset=['text', 'selected_text'], inplace=True)
test_df['text'].fillna('', inplace=True)
test_df['sentiment'].fillna('neutral', inplace=True)
test_df['Country'].fillna('Unknown', inplace=True)
test_df['Population -2020'].fillna(test_df['Population -2020'].mean(), inplace=True)
test_df['Land Area (Km²)'].fillna(test_df['Land Area (Km²)'].mean(), inplace=True)
test_df['Density (P/Km²)'].fillna(test_df['Density (P/Km²)'].mean(), inplace=True)
most_common_time_of_tweet = test_df['Time of Tweet'].mode()[0] if not test_df['Time of Tweet'].mode().empty else 'Unknown'
test_df['Time of Tweet'].fillna(most_common_time_of_tweet, inplace=True)
most_common_age_of_user = test_df['Age of User'].mode()[0] if not test_df['Age of User'].mode().empty else 'Unknown'
test_df['Age of User'].fillna(most_common_age_of_user, inplace=True)


label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(train_df['sentiment'])
y_test = label_encoder.transform(test_df['sentiment'])


vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(train_df['text'])
X_test = vectorizer.transform(test_df['text'])


log_reg = LogisticRegression(max_iter=1000, random_state=42)


log_reg.fit(X_train, y_train)


y_pred = log_reg.predict(X_test)


print("Logistic Regression Classification Report:")
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))
print("Accuracy:", accuracy_score(y_test, y_pred))


Logistic Regression Classification Report:
              precision    recall  f1-score   support

    negative       0.73      0.64      0.68      1001
     neutral       0.80      0.87      0.83      2711
    positive       0.81      0.71      0.76      1103

    accuracy                           0.79      4815
   macro avg       0.78      0.74      0.76      4815
weighted avg       0.79      0.79      0.78      4815

Accuracy: 0.7875389408099689


In [7]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression


log_reg = LogisticRegression(random_state=42)


param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear']
}


grid_search = GridSearchCV(estimator=log_reg, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)


grid_search.fit(X_train, y_train)


best_params = grid_search.best_params_
best_score = grid_search.best_score_
best_model = grid_search.best_estimator_

print("Best parameters found: ", best_params)
print("Best cross-validation accuracy: {:.2f}".format(best_score))


y_pred = best_model.predict(X_test)


print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))
print("Accuracy:", accuracy_score(y_test, y_pred))


Best parameters found:  {'C': 1, 'penalty': 'l1', 'solver': 'liblinear'}
Best cross-validation accuracy: 0.71
Classification Report:
              precision    recall  f1-score   support

    negative       0.76      0.64      0.69      1001
     neutral       0.80      0.88      0.84      2711
    positive       0.81      0.74      0.77      1103

    accuracy                           0.80      4815
   macro avg       0.79      0.75      0.77      4815
weighted avg       0.79      0.80      0.79      4815

Accuracy: 0.7958463136033229


In [9]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, accuracy_score


rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
rf_y_pred = rf_model.predict(X_test)

print("Random Forest Classification Report:")
print(classification_report(y_test, rf_y_pred, target_names=label_encoder.classes_))
print("Accuracy with Random Forest:", accuracy_score(y_test, rf_y_pred))


gb_model = GradientBoostingClassifier(n_estimators=100, random_state=42)
gb_model.fit(X_train, y_train)
gb_y_pred = gb_model.predict(X_test)

print("Gradient Boosting Classification Report:")
print(classification_report(y_test, gb_y_pred, target_names=label_encoder.classes_))
print("Accuracy with Gradient Boosting:", accuracy_score(y_test, gb_y_pred))


Random Forest Classification Report:
              precision    recall  f1-score   support

    negative       0.73      0.57      0.64      1001
     neutral       0.78      0.88      0.82      2711
    positive       0.79      0.70      0.74      1103

    accuracy                           0.77      4815
   macro avg       0.77      0.71      0.73      4815
weighted avg       0.77      0.77      0.77      4815

Accuracy with Random Forest: 0.7717549325025961
Gradient Boosting Classification Report:
              precision    recall  f1-score   support

    negative       0.78      0.40      0.53      1001
     neutral       0.71      0.92      0.80      2711
    positive       0.82      0.58      0.68      1103

    accuracy                           0.74      4815
   macro avg       0.77      0.64      0.67      4815
weighted avg       0.75      0.74      0.72      4815

Accuracy with Gradient Boosting: 0.7362409138110073


In [10]:
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding, SpatialDropout1D


tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(train_df['text'])
X_train_seq = tokenizer.texts_to_sequences(train_df['text'])
X_test_seq = tokenizer.texts_to_sequences(test_df['text'])

X_train_pad = pad_sequences(X_train_seq, maxlen=100)
X_test_pad = pad_sequences(X_test_seq, maxlen=100)


label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(train_df['sentiment'])
y_test_encoded = label_encoder.transform(test_df['sentiment'])


model = Sequential()
model.add(Embedding(5000, 128, input_length=100))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(3, activation='softmax'))

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])


history = model.fit(X_train_pad, y_train_encoded, epochs=5, batch_size=64, validation_data=(X_test_pad, y_test_encoded))


y_pred_prob = model.predict(X_test_pad)
y_pred = y_pred_prob.argmax(axis=1)

print("LSTM Classification Report:")
print(classification_report(y_test_encoded, y_pred, target_names=label_encoder.classes_))
print("Accuracy with LSTM:", accuracy_score(y_test_encoded, y_pred))


Epoch 1/5




[1m430/430[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m97s[0m 218ms/step - accuracy: 0.5208 - loss: 0.9481 - val_accuracy: 0.7807 - val_loss: 0.5969
Epoch 2/5
[1m430/430[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m144s[0m 223ms/step - accuracy: 0.7461 - loss: 0.6237 - val_accuracy: 0.7988 - val_loss: 0.5360
Epoch 3/5
[1m430/430[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m139s[0m 217ms/step - accuracy: 0.7805 - loss: 0.5441 - val_accuracy: 0.8048 - val_loss: 0.5274
Epoch 4/5
[1m430/430[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 213ms/step - accuracy: 0.7976 - loss: 0.5117 - val_accuracy: 0.8046 - val_loss: 0.5336
Epoch 5/5
[1m430/430[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m143s[0m 216ms/step - accuracy: 0.8197 - loss: 0.4675 - val_accuracy: 0.8004 - val_loss: 0.5657
[1m151/151[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 35ms/step
LSTM Classification Report:
              precision    recall  f1-score   support

    negative       0.71    

In [12]:
from keras.models import load_model


model.save('lstm_model.keras')



In [22]:
pip install tensorflow




In [24]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, Flatten, Dense


In [26]:
pip install numpy pandas scikit-learn




In [27]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, Flatten, Dense


vocab_size = 5000
embedding_dim = 50
max_length = 100


tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(train_df['text'])

X_train_seq = tokenizer.texts_to_sequences(train_df['text'])
X_train_padded = pad_sequences(X_train_seq, maxlen=max_length)

X_test_seq = tokenizer.texts_to_sequences(test_df['text'])
X_test_padded = pad_sequences(X_test_seq, maxlen=max_length)


model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length))
model.add(Conv1D(128, 5, activation='relu'))
model.add(MaxPooling1D(pool_size=4))
model.add(Flatten())
model.add(Dense(3, activation='softmax'))


model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])


model.fit(X_train_padded, y_train, epochs=5, validation_split=0.2)


Epoch 1/5




[1m687/687[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 33ms/step - accuracy: 0.5141 - loss: 0.9574 - val_accuracy: 0.6961 - val_loss: 0.7095
Epoch 2/5
[1m687/687[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 24ms/step - accuracy: 0.7494 - loss: 0.6221 - val_accuracy: 0.7182 - val_loss: 0.6915
Epoch 3/5
[1m687/687[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 24ms/step - accuracy: 0.8143 - loss: 0.4948 - val_accuracy: 0.7021 - val_loss: 0.7409
Epoch 4/5
[1m687/687[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 24ms/step - accuracy: 0.8694 - loss: 0.3749 - val_accuracy: 0.6890 - val_loss: 0.8433
Epoch 5/5
[1m687/687[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 24ms/step - accuracy: 0.9197 - loss: 0.2526 - val_accuracy: 0.6729 - val_loss: 0.9773


<keras.src.callbacks.history.History at 0x7f57f642e440>

In [29]:
model.save('my_model.keras')