<a href="https://colab.research.google.com/github/Meenusj/Case_study/blob/main/trainlstm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install FastText

Collecting FastText
  Downloading fasttext-0.9.2.tar.gz (68 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m68.8/68.8 kB[0m [31m993.3 kB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pybind11>=2.2 (from FastText)
  Using cached pybind11-2.11.1-py3-none-any.whl (227 kB)
Building wheels for collected packages: FastText
  Building wheel for FastText (setup.py) ... [?25l[?25hdone
  Created wheel for FastText: filename=fasttext-0.9.2-cp310-cp310-linux_x86_64.whl size=4199773 sha256=56dd248d1f390598e671446bc1af0f6d1f8877ad815bdfefb3cffe2fc6f0a082
  Stored in directory: /root/.cache/pip/wheels/a5/13/75/f811c84a8ab36eedbaef977a6a58a98990e8e0f1967f98f394
Successfully built FastText
Installing collected packages: pybind11, FastText
Successfully installed FastText-0.9.2 pybind11-2.11.1


In [None]:
import pickle
import re

import fasttext
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dropout, Dense, LSTM
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

nltk.download('punkt')
nltk.download('stopwords')

# Read the raw training data. The file is semicolon-delimited; point
# `csv_file_path` at your own dataset if it lives elsewhere.
csv_file_path = 'train.csv'
df = pd.read_csv(csv_file_path, delimiter=';')

# Matches whole hashtags (#topic) and user mentions (@name) in the raw text.
_HASHTAG_MENTION_RE = re.compile(r'#\w+|@\w+')

# Built once at definition time instead of on every call to preprocess().
_STOP_WORDS = frozenset(stopwords.words('english'))


def preprocess(text):
    """Clean one raw text value and return it as a space-joined token string.

    Steps: drop hashtags and mentions, tokenize with NLTK, lowercase, keep
    only alphanumeric tokens, and remove English stop words.

    Args:
        text: The raw text. Missing values (None / NaN) yield an empty
            string; other non-string values are converted with ``str``.

    Returns:
        The cleaned tokens joined by single spaces ('' when nothing remains).
    """
    if not isinstance(text, str):
        if pd.isna(text):
            return ''
        text = str(text)

    # Hashtags and mentions must be removed BEFORE tokenizing: word_tokenize
    # splits '#'/'@' from the following word, so a per-token regex can never
    # match and the tag/user name would survive as an ordinary word.
    text = _HASHTAG_MENTION_RE.sub(' ', text)

    lowered = (word.lower() for word in word_tokenize(text))
    kept = [word for word in lowered if word.isalnum() and word not in _STOP_WORDS]
    return ' '.join(kept)

# Clean every document. Missing source text is normalised to '' first so the
# tokenizer inside preprocess() is never handed a float NaN.
df['preprocessed_text'] = df['text'].fillna('').astype(str).apply(preprocess)

# Persist the cleaned data so later stages can restart from this checkpoint.
preprocessed_csv_path = 'preprocessed_dataset.csv'
df.to_csv(preprocessed_csv_path, index=False)

print(f"Preprocessed data saved to {preprocessed_csv_path}")

# Reload from the checkpoint.
df = pd.read_csv(preprocessed_csv_path)

# Documents that became empty after cleaning are read back from CSV as NaN.
# Assign the result back rather than calling fillna(..., inplace=True) on the
# column selection: that chained in-place form is deprecated and does not
# update the frame under pandas Copy-on-Write.
df['preprocessed_text'] = df['preprocessed_text'].fillna('').astype(str)

# Fit the Keras vocabulary on the cleaned corpus and encode every document
# as a list of integer word indices.
tokenizer = Tokenizer()
corpus = df['preprocessed_text']
tokenizer.fit_on_texts(corpus)
tokenized_text = tokenizer.texts_to_sequences(corpus)

# Pad every sequence to the length of the longest document.
max_len = max(map(len, tokenized_text))
X_padded = pad_sequences(tokenized_text, maxlen=max_len)

# Write the training corpus for FastText: one document per line, tokens
# separated by spaces. The tokens written are the actual WORDS (mapped back
# from the Keras indices), not the integer indices themselves: the model is
# later queried with words, so it has to be trained on the same vocabulary
# for the vectors (and their character n-grams) to be meaningful.
with open('tokenized_text.txt', 'w', encoding='utf-8') as file:
    for tokens in tokenized_text:
        words = (tokenizer.index_word[index] for index in tokens if index in tokenizer.index_word)
        file.write(" ".join(words) + "\n")

# Train 300-dimensional skip-gram embeddings on the corpus.
model = fasttext.train_unsupervised('tokenized_text.txt', model='skipgram', dim=300, epoch=10)

# Save the model
model.save_model('fasttext_model_lstm.bin')


# Build ONE feature vector per document by mean-pooling the FastText vectors
# of its tokens. Collecting one row per token (as a flat list) and then
# concatenating column-wise with the per-document frame would pair document i
# with the i-th token of the whole corpus and append label-less rows.
embedding_dim = model.get_dimension()

# Look each vocabulary word up once instead of once per occurrence.
vocab_vectors = {
    index: model.get_word_vector(word)
    for index, word in tokenizer.index_word.items()
}

# Documents with no tokens keep an all-zero vector.
word_vectors = np.zeros((len(tokenized_text), embedding_dim), dtype=np.float32)
for row, tokens in enumerate(tokenized_text):
    token_vectors = [vocab_vectors[index] for index in tokens if index in vocab_vectors]
    if token_vectors:
        word_vectors[row] = np.mean(token_vectors, axis=0)

# Every document must have exactly one feature row before joining.
assert len(word_vectors) == len(df), "feature rows must match document rows"

# Convert the document vectors to a DataFrame that shares df's index so the
# column-wise concat below aligns row for row.
word_vectors_df = pd.DataFrame(
    word_vectors,
    columns=[f'feature_{i}' for i in range(embedding_dim)],
    index=df.index,
)

# Concatenate the original DataFrame with the document vectors
df_with_vectors = pd.concat([df, word_vectors_df], axis=1)

# Save the DataFrame with additional columns for the document vectors
df_with_vectors.to_csv('df_with_vectors.csv', index=False)

# Reload the feature table (the CSV was written with a header row).
df = pd.read_csv('df_with_vectors.csv')

# Feature columns are the ones named 'feature_<i>'.
feature_columns = df.columns[df.columns.str.startswith('feature_')]

# Keep only rows that have both a label and a complete feature vector.
# A missing label would otherwise be encoded as an extra class of its own,
# and missing features would propagate NaN into training.
df = df.dropna(subset=['class_type', *feature_columns]).reset_index(drop=True)

# Extract features and labels
X = df[feature_columns].values
y = df['class_type']

# Encode class labels as consecutive integers
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test_encoded = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# The LSTM expects 3-D input (samples, timesteps, features): treat each of the
# feature columns as one timestep carrying a single value.
X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)

# Size the output layer from the encoder's full label set rather than from the
# labels that happen to land in the training split: if a rare class is missing
# from y_train, the softmax would be too small for the test labels.
num_classes = len(label_encoder.classes_)

# Create the LSTM model: one recurrent layer, dropout for regularisation, a
# small dense layer, and a softmax over the classes.
model_lstm = Sequential()
model_lstm.add(LSTM(100, input_shape=(X_train.shape[1], 1)))
model_lstm.add(Dropout(0.5))
model_lstm.add(Dense(32, activation='relu'))
model_lstm.add(Dense(num_classes, activation='softmax'))

# Labels are integer-encoded, hence the sparse categorical loss.
model_lstm.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Display the model summary
model_lstm.summary()

# Train the LSTM model, holding out 20% of the training split for validation.
model_lstm.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2, shuffle=True)

import numpy as np

# Score the held-out split; the model was compiled with a single 'accuracy'
# metric, so evaluate() yields the loss followed by the accuracy.
loss, accuracy = model_lstm.evaluate(X_test, y_test_encoded)
print(f"Loss: {loss:.4f}")
print(f"Accuracy: {accuracy:.4f}")

# Per-class probabilities for every test sample.
y_test_probs = model_lstm.predict(X_test)

# The predicted class is the column holding the largest probability.
y_test_pred = y_test_probs.argmax(axis=1)

# Map the integer predictions back to the original class labels.
y_test_pred_original = label_encoder.inverse_transform(y_test_pred)

# Save the trained LSTM model
model_lstm.save('lstm_model.h5')
print("LSTM model saved.")

# Save the label encoder so predictions can be decoded at inference time
with open('label_encoder.pkl', 'wb') as le_file:
    pickle.dump(label_encoder, le_file)


# Evaluate the model performance. zero_division=0 makes the score for a class
# that is never predicted an explicit 0 instead of emitting an
# undefined-metric warning for every call.
precision = precision_score(y_test_encoded, y_test_pred, average='weighted', zero_division=0)
recall = recall_score(y_test_encoded, y_test_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test_encoded, y_test_pred, average='weighted', zero_division=0)

print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

# Fix the label order to the encoder's classes so the matrix and the report
# always cover every class, and show the original class names rather than the
# opaque encoded integers.
class_labels = list(range(len(label_encoder.classes_)))
class_names = [str(name) for name in label_encoder.classes_]

# Display confusion matrix and classification report
conf_matrix = confusion_matrix(y_test_encoded, y_test_pred, labels=class_labels)
class_report = classification_report(
    y_test_encoded,
    y_test_pred,
    labels=class_labels,
    target_names=class_names,
    zero_division=0,
)

print("Confusion Matrix:")
print(conf_matrix)

print("\nClassification Report:")
print(class_report)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Preprocessed data saved to preprocessed_dataset.csv


  df = pd.read_csv('df_with_vectors.csv')


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 100)               40800     
                                                                 
 dropout (Dropout)           (None, 100)               0         
                                                                 
 dense (Dense)               (None, 32)                3232      
                                                                 
 dense_1 (Dense)             (None, 5)                 165       
                                                                 
Total params: 44197 (172.64 KB)
Trainable params: 44197 (172.64 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Loss: 0.4531
Accuracy: 0.8990


  saving_api.save_model(
  _warn_prf(average, modifier, msg_start, len(result))


LSTM model saved.
Precision: 0.8082
Recall: 0.8990
F1 Score: 0.8512
Confusion Matrix:
[[    0     0     0     0   611]
 [    0     0     0     0  2055]
 [    0     0     0     0   758]
 [    0     0     0     0   673]
 [    0     0     0     0 36470]]

Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       611
           1       0.00      0.00      0.00      2055
           2       0.00      0.00      0.00       758
           3       0.00      0.00      0.00       673
           4       0.90      1.00      0.95     36470

    accuracy                           0.90     40567
   macro avg       0.18      0.20      0.19     40567
weighted avg       0.81      0.90      0.85     40567



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Persist the fitted label encoder for later decoding of predictions.
# (This repeats the save already performed at the end of the training cell.)
with open('label_encoder.pkl', 'wb') as le_file:
    le_file.write(pickle.dumps(label_encoder))