<a href="https://colab.research.google.com/github/Meenusj/deepfake-detection/blob/main/trainingwithlstm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pandas scikit-learn nltk fasttext tensorflow

Collecting fasttext
  Downloading fasttext-0.9.2.tar.gz (68 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m68.8/68.8 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pybind11>=2.2 (from fasttext)
  Using cached pybind11-2.11.1-py3-none-any.whl (227 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (setup.py) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.2-cp310-cp310-linux_x86_64.whl size=4199772 sha256=9826afd78d5b9314f12b74f374fee8fc422b5747dd6748bf2677d733000cbe02
  Stored in directory: /root/.cache/pip/wheels/a5/13/75/f811c84a8ab36eedbaef977a6a58a98990e8e0f1967f98f394
Successfully built fasttext
Installing collected packages: pybind11, fasttext
Successfully installed fasttext-0.9.2 pybind11-2.11.1


In [None]:
import nltk

# Fetch the NLTK resources used below: the Punkt tokenizer data (needed by
# word_tokenize) and the stop-word lists.
# NLTK 3.9+ loads Punkt from 'punkt_tab' while older releases load 'punkt'.
# The install above is unpinned, so both are requested to work on either.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

import pickle
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Dense, Embedding
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import load_model
import fasttext
import re

# Input data: a semicolon-delimited CSV (point this path at your own dataset file).
csv_file_path = 'train.csv'
df = pd.read_csv(csv_file_path, delimiter=';')

# Show the available column names so the text and label columns can be confirmed.
print(df.columns)

# Assuming the correct column name is 'text'
def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, built once and reused."""
    cached = getattr(_english_stop_words, '_cached', None)
    if cached is None:
        cached = frozenset(stopwords.words('english'))
        _english_stop_words._cached = cached
    return cached


def preprocess(text, stop_words=None):
    """Normalise one raw text into a space-joined string of kept tokens.

    Steps, in order: split on whitespace, lowercase, strip ``#hashtag`` and
    ``@mention`` patterns, keep only purely alphanumeric tokens, drop stop words.

    Args:
        text: The raw text. ``None`` and NaN (what pandas yields for an empty
            CSV cell) produce an empty string; any other non-string value is
            converted with ``str()`` first.
        stop_words: Optional collection of lowercase words to discard. When
            omitted, NLTK's English stop-word list is used (loaded once and
            cached instead of being rebuilt on every call).

    Returns:
        The surviving tokens joined by single spaces ('' if none survive).
    """
    # Missing values: NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if stop_words is None:
        stop_words = _english_stop_words()

    kept = []
    for word in str(text).split():
        word = re.sub(r'#\w+|@\w+', '', word.lower())
        # isalnum() is False for '' (a fully removed hashtag/mention) and for
        # any token that still carries punctuation, so both are discarded here.
        if word.isalnum() and word not in stop_words:
            kept.append(word)
    return ' '.join(kept)

# Clean the 'text' column. Empty CSV cells arrive as NaN (a float), which a
# string-only cleaner cannot split, so missing values become '' and everything
# is coerced to str before preprocess() runs.
df['preprocessed_text'] = df['text'].fillna('').astype(str).apply(preprocess)

# Persist the cleaned frame so the preprocessing result can be inspected/reused.
preprocessed_csv_path = 'preprocessed_dataset.csv'
df.to_csv(preprocessed_csv_path, index=False)
print(f"Preprocessed data saved to {preprocessed_csv_path}")

# Continue from the file that was just written.
df = pd.read_csv(preprocessed_csv_path)

# Texts that were reduced to '' come back from the CSV as NaN. Assign the result
# instead of calling fillna(..., inplace=True) on the selected column: the
# in-place form mutates an intermediate object, is deprecated by pandas, and
# does not update df once copy-on-write is enabled.
df['preprocessed_text'] = df['preprocessed_text'].fillna('').astype(str)

# Tokenise every cleaned text with NLTK's word tokenizer (one token list per row).
tokenized_text = [word_tokenize(text) for text in df['preprocessed_text']]

# Write the training corpus: one document per line, tokens separated by spaces.
# The encoding is set explicitly because fastText reads its input as UTF-8,
# whereas open() would otherwise use the platform's locale encoding and could
# fail or corrupt non-ASCII tokens on a non-UTF-8 system.
with open('tokenized_text.txt', 'w', encoding='utf-8') as corpus_file:
    corpus_file.writelines(" ".join(tokens) + "\n" for tokens in tokenized_text)

# Train 300-dimensional skip-gram embeddings on that corpus for 10 epochs.
model_fasttext = fasttext.train_unsupervised('tokenized_text.txt', model='skipgram', dim=300, epoch=10)

# Keep the trained embeddings on disk.
model_fasttext.save_model('fasttext_model.bin')

# Build ONE fixed-length feature vector per document (row of df).
#
# The previous version flattened the vectors of every token in the corpus into
# a single list. Concatenating that with df along axis=1 paired row i with the
# i-th token of the whole corpus (not with document i) and appended extra rows
# whose label is NaN, so the classifier was trained on misaligned data.
# Averaging the token vectors of each document keeps rows and labels aligned.
vector_dim = 300  # must equal the `dim` the fastText model was trained with


def _document_vector(tokens):
    """Average the fastText vectors of `tokens`; all zeros when there are no tokens."""
    if not tokens:
        return np.zeros(vector_dim, dtype=np.float32)
    return np.mean([model_fasttext.get_word_vector(word) for word in tokens], axis=0)


# Name kept for compatibility; each entry is now the vector of a whole document.
word_vectors = [_document_vector(tokens) for tokens in tokenized_text]

# One row per document, sharing df's index so the concat below is a true
# row-wise join rather than an accidental positional one.
word_vectors_df = pd.DataFrame(
    word_vectors,
    columns=[f'feature_{i}' for i in range(vector_dim)],
    index=df.index,
)

# Attach the feature columns to the original columns (text, labels, ...).
df_with_vectors = pd.concat([df, word_vectors_df], axis=1)
assert len(df_with_vectors) == len(df), "feature rows are not aligned with documents"

# Save the combined frame for the training stage.
df_with_vectors.to_csv('df_with_vectors.csv', index=False)

# Reload the table that holds both the labels and the feature_* columns.
df = pd.read_csv('df_with_vectors.csv')

# Inputs are every column whose name starts with 'feature_';
# the target is the 'class_type' column.
feature_columns = [name for name in df.columns if name.startswith('feature_')]
X = df[feature_columns].values
y = df['class_type']

# Map the class labels to integer ids.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Hold out 20% of the rows for testing (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test_actual = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
)

# The LSTM layer takes 3-D input, so insert a length-1 middle axis:
# (samples, features) -> (samples, 1, features).
X_train = np.expand_dims(X_train, axis=1)
X_test = np.expand_dims(X_test, axis=1)

# Network definition: LSTM -> dropout -> dense hidden layer -> softmax with one
# output unit per distinct encoded label.
n_classes = np.unique(y_encoded).shape[0]
model_lstm = Sequential([
    LSTM(100, input_shape=(X_train.shape[1], X_train.shape[2]), activation='relu'),
    Dropout(0.5),
    Dense(32, activation='relu'),
    Dense(n_classes, activation='softmax'),
])

# Integer class ids are used as targets, hence the sparse categorical loss.
model_lstm.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Stop once validation loss has not improved for 3 epochs and roll back to the
# best weights seen.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Fit for up to 20 epochs, holding out 20% of the training data for validation.
model_lstm.fit(
    X_train,
    y_train,
    epochs=20,
    batch_size=32,
    validation_split=0.2,
    callbacks=[early_stopping],
)

# Predict class probabilities for the test set and take the most likely class.
y_pred_probs_lstm = model_lstm.predict(X_test)
y_pred_classes_lstm = y_pred_probs_lstm.argmax(axis=-1)

# Accuracy, plus support-weighted precision / recall / F1 over all classes.
accuracy_lstm = accuracy_score(y_test_actual, y_pred_classes_lstm)
precision_lstm, recall_lstm, f1_lstm = (
    scorer(y_test_actual, y_pred_classes_lstm, average='weighted')
    for scorer in (precision_score, recall_score, f1_score)
)

# Report the scores to four decimal places.
print(f"LSTM Model Evaluation Metrics:")
print(f"Accuracy: {accuracy_lstm:.4f}")
print(f"Precision: {precision_lstm:.4f}")
print(f"Recall: {recall_lstm:.4f}")
print(f"F1 Score: {f1_lstm:.4f}")

# Persist the trained network.
model_lstm.save('lstm_model.h5')

# Persist the fitted label encoder next to it so predicted class ids can be
# mapped back to the original labels later.
with open('label_encoder_lstm.pkl', 'wb') as encoder_out:
    encoder_out.write(pickle.dumps(label_encoder))

# Read the network back from disk.
loaded_model_lstm = load_model('lstm_model.h5')

# Read the label encoder back from disk.
# NOTE: unpickling executes code embedded in the file; only load pickles that
# this notebook (or another trusted source) produced.
with open('label_encoder_lstm.pkl', 'rb') as encoder_in:
    loaded_label_encoder = pickle.loads(encoder_in.read())

# Predict using the loaded LSTM model
# Note: You need to preprocess and vectorize new text data similar to the training data
# Then, use the loaded model and label encoder for predictions
# ...

# Alternatively, you can reuse the preprocessing and vectorization code used for the training data
# and then make predictions using the loaded model and label encoder
# ...


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Index(['screen_name', 'text', 'account.type', 'class_type'], dtype='object')
Preprocessed data saved to preprocessed_dataset.csv


  df = pd.read_csv('df_with_vectors.csv')


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20


  _warn_prf(average, modifier, msg_start, len(result))
  saving_api.save_model(


LSTM Model Evaluation Metrics:
Accuracy: 0.8749
Precision: 0.7655
Recall: 0.8749
F1 Score: 0.8165
