In [None]:
# Colab-only: make Google Drive available under /content/drive so files stored
# there can be read by later cells.
from google.colab import drive
drive.mount('/content/drive')

CLUSTERING AND CLASSIFICATION

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# SVM
from sklearn.neighbors import KNeighborsClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
# Naive Bayes
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MaxAbsScaler
import numpy as np
# Logistic Regression
from sklearn.linear_model import LogisticRegression

# DNN and LSTM
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Flatten

# AdaBOOST
from sklearn.ensemble import AdaBoostClassifier

# LightGBM
import lightgbm as lgb

import xgboost as xgb
from xgboost import XGBClassifier

# decision tree, random forest, svm, naive bayes, LR, LightGBM, AdaBoost, XGBoost, DNN, LSTM

Load Dataset

In [None]:
# Location of the labelled tweets CSV. The empty string is a placeholder and
# has to be filled in before this cell is run.
csv_file_path = ''  # Replace with the path to your CSV file
# Later cells read the 'tweets' and 'category' columns of this frame.
df = pd.read_csv(csv_file_path)

In [None]:
# Build the raw inputs (X: tweet text) and targets (y: category name).
# Rows missing either field are dropped: an unlabelled row cannot be used for
# supervised training, and a missing tweet (NaN) is rejected by the text
# vectoriser/tokeniser used further down.
labelled_rows = df.dropna(subset=['tweets', 'category'])
X = labelled_rows['tweets'].tolist()
y = labelled_rows['category'].tolist()


Preprocessing the tweets

In [None]:
# Hand-written list of the expected category names, mapped to their position
# in this list. NOTE(review): the labels used for training are produced by the
# LabelEncoder further down, not by this mapping, so these indices are not
# guaranteed to match the encoded class ids — confirm before using them as such.
category_dict = {category: idx for idx, category in enumerate(['disaster', 'areas', 'help', 'transport', 'time', 'health',
       'victim', 'crops', 'building', 'education', 'water', 'weather',
       'quantity', 'food', 'clothing'])}

In [None]:
# Display the category -> index mapping.
category_dict

In [None]:
# Spot-check a handful of tweets at fixed row labels.
for row_label in (145, 1875, 2465, 100):
    print(df['tweets'][row_label])

In [None]:
# Category distribution within the first 20000 rows only.
df['category'][:20000].value_counts()

In [None]:
# Category distribution over the whole frame.
df['category'].value_counts()

In [None]:
# Alias for the raw tweet texts. This name is rebound to the TF-IDF matrix a
# few cells below, while X keeps the raw strings.
tweets = X

Vectorization

In [None]:
# Learn the TF-IDF vocabulary and weights, and vectorise the tweets in one pass.
# NOTE(review): the vectoriser is fitted on every tweet before the train/test
# split, so its statistics include the test documents — confirm this is acceptable.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(tweets)

In [None]:
# From here on `tweets` is the TF-IDF matrix, not the list of raw strings.
tweets = tfidf_matrix

In [None]:
# Vocabulary terms learned by the vectoriser (one per TF-IDF column).
feature_names = vectorizer.get_feature_names_out()

In [None]:
# Inspect the TF-IDF matrix without densifying all of it: calling .toarray()
# on the full sparse matrix allocates n_documents x vocabulary_size floats,
# which can exhaust memory on a large corpus. The shape plus a small dense
# slice is enough for a sanity check.
print(tfidf_matrix.shape)
print(tfidf_matrix[:5].toarray())

In [None]:
# Show the learned vocabulary terms.
print(feature_names)

Perform label encoding on categories

In [None]:
# Encoder used to turn category names into integer class ids.
le = LabelEncoder()

In [None]:
# Fit the encoder on the category names and replace y with the integer codes.
y = le.fit_transform(y)

In [None]:
import numpy as np

# Tally how many samples carry each encoded label.
unique_labels, label_counts = np.unique(y, return_counts=True)

# Report one line per class.
for position in range(len(unique_labels)):
    label, count = unique_labels[position], label_counts[position]
    print(f"Label {label}: {count} occurrences")

Train/test split

In [None]:
# Hold out 20% of the TF-IDF rows for testing; the fixed seed keeps the split
# repeatable. The classical models below all train on this split.
X_train, X_test, y_train, y_test = train_test_split(tweets, y , test_size=0.2, random_state=42)

Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Single decision tree on the TF-IDF features. The seed is fixed because tree
# construction is randomised; without it the reported accuracy can change from
# one run of the notebook to the next.
clf = DecisionTreeClassifier(random_state=42)
clf = clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))

Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest on the TF-IDF features. Bootstrapping and feature sampling are
# random, so the seed is fixed to make the reported accuracy reproducible.
clf = RandomForestClassifier(random_state=42)
clf = clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)
print("Accuracy:",accuracy_score(y_test,y_pred))

SGD Classifier

In [None]:
# Linear classifier trained with stochastic gradient descent. The training
# data is shuffled between epochs, so a fixed seed is needed for repeatable
# scores.
model = SGDClassifier(random_state=42)
model.fit(X_train,y_train)
y_pred = model.predict(X_test)

In [None]:
# Overall accuracy followed by per-class precision/recall/F1 for the SGD model.
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test, y_pred))

Naive Bayes

In [None]:
# Multinomial Naive Bayes trained on the TF-IDF split.
model = MultinomialNB()
model.fit(X_train,y_train)

In [None]:
y_pred = model.predict(X_test)
# scikit-learn metrics take (y_true, y_pred). Passing them the other way round
# swaps precision and recall in the report, so the ground truth goes first.
print(classification_report(y_test, y_pred))
print(accuracy_score(y_test, y_pred))

Logistic Regression

In [None]:
# Logistic regression with default settings, fitted on the TF-IDF split and
# then used to label the held-out rows.
model = LogisticRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

In [None]:
# Per-class report followed by overall accuracy for the logistic regression model.
print(classification_report(y_test, y_pred))
print(accuracy_score(y_test, y_pred))

DNN Model

In [None]:
# Build a word index from the raw tweets (X, not the TF-IDF matrix) and turn
# each tweet into a list of integer word ids for the neural models.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
sequences = tokenizer.texts_to_sequences(X)

In [None]:
# Peek at one encoded tweet.
print(sequences[8])

In [None]:
# Length of the longest encoded tweet; used as the common padded length.
max_len = max(map(len, sequences))

In [None]:
# Pad every sequence at the end up to max_len so all rows have the same length.
X_padded = pad_sequences(sequences,maxlen=max_len,padding='post')

In [None]:
# Same tweet as above, after padding.
print(X_padded[8])

In [None]:
# Size the softmax layer from the labels actually present in the data rather
# than from the hand-written category list: the sparse categorical loss needs
# every encoded label to be smaller than num_classes.
num_classes = len(le.classes_)

In [None]:
# Sanity check: (number of tweets, max_len).
X_padded.shape

In [None]:
# Sanity check: number of labels, to compare with the number of padded rows above.
y.shape

In [None]:
# Split the padded sequences into training and testing sets.
# This rebinds X_train/X_test/y_train/y_test, replacing the TF-IDF split used
# by the classical models above.
X_train, X_test, y_train, y_test = train_test_split(X_padded, y, test_size=0.2, random_state=42)

In [None]:
# Define the sequential neural model:
# word embedding -> flatten -> four ReLU dense layers -> softmax over the classes.
# The embedding vocabulary is word_index + 1 because id 0 is left for padding
# (word ids from the tokenizer start at 1).
model = tf.keras.Sequential([
    Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=512, input_length=max_len),
    Flatten(),
    Dense(512, activation='relu'),
    Dense(256, activation='relu'),
    Dense(128, activation='relu'),
    Dense(64, activation='relu'),
    Dense(num_classes, activation='softmax')
])

In [None]:
# Compile the model. The sparse loss is used because y holds integer class ids
# rather than one-hot vectors.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=3, batch_size=40, verbose=1)

# Evaluate the model on the held-out padded sequences
loss, accuracy = model.evaluate(X_test, y_test)
print("Accuracy:", accuracy)

In [None]:
# Turn the softmax outputs into hard class ids and score them against the
# held-out labels.
class_probabilities = model.predict(X_test)
y_pred = np.argmax(class_probabilities, axis=1)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

LSTM MODEL

In [None]:
# Stacked LSTM classifier: embedding -> four LSTM layers (64, 32, 16, 8 units)
# -> two ReLU dense layers -> softmax over the classes.
# The first three LSTM layers return full sequences so the next LSTM receives
# one vector per time step; the last one returns only its final output.
# NOTE(review): the LSTM layers use activation='relu' instead of the Keras
# default; confirm this is intentional, since a non-default activation may
# rule out Keras' optimised LSTM kernel.
model = tf.keras.Sequential([
    Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=512, input_length=max_len),
    tf.keras.layers.LSTM(64, activation='relu', return_sequences=True),
    tf.keras.layers.LSTM(32, activation='relu',return_sequences=True),
    tf.keras.layers.LSTM(16, activation='relu',return_sequences=True),
    tf.keras.layers.LSTM(8, activation='relu'),
    tf.keras.layers.Dense(512, activation='relu'),
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dense(num_classes, activation='softmax')
])

In [None]:
# Compile the model (integer class ids -> sparse categorical loss)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
# Train the model on the padded-sequence split
model.fit(X_train, y_train, epochs=3, batch_size=40, verbose=1)
# Evaluate the model on the held-out rows
loss, accuracy = model.evaluate(X_test, y_test)
print("Accuracy:", accuracy)

In [None]:
# Per-class probabilities for every held-out tweet.
y_pred = model.predict(X_test)
# Convert probabilities to class predictions (index of the largest probability)
y_pred = np.argmax(y_pred, axis=1)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print(classification_report(y_test,y_pred))
# Architecture evaluated here: 4 LSTM layers (64, 32, 16, 8) and 2 dense layers (512, 256)

ADABOOST

In [None]:
# Summary of the TF-IDF vectorised tweets. `X` still holds the raw tweet
# strings (printing it would dump the entire corpus); the vectorised data that
# the boosting models train on lives in `tweets`.
print(repr(tweets))

In [None]:
# Split the TF-IDF matrix into training and testing sets, replacing the
# padded-sequence split used by the neural models.
# NOTE(review): this cell holds out 18% while the other splits in this
# notebook use 20% — confirm the difference is intentional.
X_train, X_test, y_train, y_test = train_test_split(tweets, y, test_size=0.18, random_state=42)

In [None]:
# Initialize the AdaBoost classifier (50 boosting rounds, fixed seed) and fit
# it on the TF-IDF training rows.
adaboost_classifier = AdaBoostClassifier(n_estimators=50, random_state=42)
adaboost_classifier.fit(X_train, y_train)

In [None]:
# Make predictions on the test data, then report overall accuracy and the
# per-class precision/recall/F1.
y_pred = adaboost_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print(classification_report(y_test, y_pred))


LightGBM

In [None]:
# Back to a 20% hold-out of the TF-IDF matrix. The XGBoost cells further down
# reuse this same split.
X_train, X_test, y_train, y_test = train_test_split(tweets, y, test_size=0.2, random_state=42)

In [None]:
# Multiclass LightGBM on the TF-IDF split. The class count is taken from the
# fitted label encoder instead of a hard-coded 15 so that it always matches
# the labels present in the data.
lgb_classifier = lgb.LGBMClassifier(objective='multiclass', num_class=len(le.classes_))
lgb_classifier.fit(X_train, y_train)

In [None]:
# Model evaluation: predict the held-out rows and report overall accuracy.
y_pred = lgb_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

XGBoost

In [None]:
# Wrap the current split in XGBoost's DMatrix containers. Only the training
# matrix carries labels; the test matrix is used for prediction only.
dtrain = xgb.DMatrix(data=X_train,label=y_train)
dtest = xgb.DMatrix(data=X_test)

In [None]:
# Booster parameters for the native xgboost training API.
params = {
            'objective':'multi:softmax',
            'max_depth': 4,
            'alpha': 10,
            # Taken from the fitted label encoder so it matches the labels in dtrain.
            'num_class': len(le.classes_),
        }
# `n_estimators` is an argument of the scikit-learn wrapper (XGBClassifier),
# not a booster parameter: placed in `params` it is not used by xgb.train,
# which then runs only its default 10 boosting rounds. With the native API the
# number of trees is set through num_boost_round.
clf = xgb.train(params, dtrain, num_boost_round=100)

In [None]:
# The booster was trained with the 'multi:softmax' objective, so predict()
# is scored directly as class ids here (no argmax step as in the Keras cells).
y_pred = clf.predict(dtest)
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test, y_pred))