<a href="https://colab.research.google.com/github/Meenusj/deepfake-detection/blob/main/training_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install pandas scikit-learn nltk fasttext tensorflow

Collecting fasttext
  Downloading fasttext-0.9.2.tar.gz (68 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m68.8/68.8 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pybind11>=2.2 (from fasttext)
  Using cached pybind11-2.11.1-py3-none-any.whl (227 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (setup.py) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.2-cp310-cp310-linux_x86_64.whl size=4199773 sha256=f5a28443d405c172c6d04c49e75bf0178683740acef5e9b59ca5c7e80fd791c2
  Stored in directory: /root/.cache/pip/wheels/a5/13/75/f811c84a8ab36eedbaef977a6a58a98990e8e0f1967f98f394
Successfully built fasttext
Installing collected packages: pybind11, fasttext
Successfully installed fasttext-0.9.2 pybind11-2.11.1


In [8]:
# Import NLTK and download 'punkt' and 'stopwords'
import nltk
nltk.download('punkt')
nltk.download('stopwords')

import pickle
from sklearn.metrics import confusion_matrix, classification_report

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import fasttext
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dropout, Dense

# Load your dataset (replace 'train.csv' with your actual dataset file)
# Load the CSV file
csv_file_path = 'train.csv'
df = pd.read_csv(csv_file_path, sep=';')


# Print column names to identify the correct column
print(df.columns)

# Assuming the correct column name is 'text', replace 'tweet_text' with the actual name
def preprocess(text):
    # Tokenization
    tokens = text.split()

    # Case conversion
    tokens = [word.lower() for word in tokens]

    # Remove hashtags and usernames
    tokens = [re.sub(r'#\w+|@\w+', '', word) for word in tokens]

    # Remove punctuation
    tokens = [word for word in tokens if word.isalnum()]

    # Stopword removal
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    return ' '.join(tokens)

# Apply pre-processing to the specified column in the DataFrame
# Replace 'tweet_text' with the actual column name, which is 'text' in this case
df['preprocessed_text'] = df['text'].apply(preprocess)

# Save the preprocessed DataFrame to a new CSV file
preprocessed_csv_path = 'preprocessed_dataset.csv'
df.to_csv(preprocessed_csv_path, index=False)

print(f"Preprocessed data saved to {preprocessed_csv_path}")

# Load the preprocessed dataset
preprocessed_csv_path = 'preprocessed_dataset.csv'
df = pd.read_csv(preprocessed_csv_path)

# Check for NaN values and replace them with an empty string
df['preprocessed_text'].fillna('', inplace=True)

# Check for non-string values and convert them to strings
df['preprocessed_text'] = df['preprocessed_text'].astype(str)

# Tokenize the text
tokenized_text = [word_tokenize(text) for text in df['preprocessed_text']]

# Save tokenized text to a text file (required format for FastText)
with open('tokenized_text.txt', 'w') as file:
    for tokens in tokenized_text:
        file.write(" ".join(tokens) + "\n")

# Train FastText model
model = fasttext.train_unsupervised('tokenized_text.txt', model='skipgram', dim=300, epoch=10)

# Save the model
model.save_model('fasttext_model.bin')

# Get word vectors for each token
word_vectors = [model.get_word_vector(word) for tokens in tokenized_text for word in tokens]

# Convert word vectors to DataFrame
word_vectors_df = pd.DataFrame(word_vectors, columns=[f'feature_{i}' for i in range(300)])

# Concatenate the original DataFrame with the word vectors DataFrame
df_with_vectors = pd.concat([df, word_vectors_df], axis=1)

# Save the DataFrame with additional columns for word vectors
df_with_vectors.to_csv('df_with_vectors.csv', index=False)


# Load your DataFrame without header
df = pd.read_csv('df_with_vectors.csv')
print(df.columns)

# Extract feature columns (assuming they start from column 'feature_0')
feature_columns = df.columns[df.columns.str.startswith('feature_')]

# Extract features and labels
X = df[feature_columns].values
y = df['class_type']
# Encode class labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(df['class_type'])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test_actual = train_test_split(X,y_encoded, test_size=0.2, random_state=42)

# Reshape the input data to be compatible with Conv1D layer
X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)

# Define and compile your model
model_cnn = Sequential()
model_cnn.add(Conv1D(filters=64, kernel_size=7, activation='relu', input_shape=(X_train.shape[1], 1)))
model_cnn.add(GlobalMaxPooling1D())
model_cnn.add(Dropout(0.5))
model_cnn.add(Dense(32, activation='relu'))
model_cnn.add(Dense(len(label_encoder.classes_), activation='softmax'))
model_cnn.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
model_cnn.fit(X_train, y_train, epochs=5, batch_size=32, validation_split=0.1)

# Evaluate the model
y_pred_probs = model_cnn.predict(X_test)
y_pred_classes = y_pred_probs.argmax(axis=-1)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test_actual, y_pred_classes)
precision = precision_score(y_test_actual, y_pred_classes, average='weighted')
recall = recall_score(y_test_actual, y_pred_classes, average='weighted')
f1 = f1_score(y_test_actual, y_pred_classes, average='weighted')


# Print the evaluation metrics
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

# Create a confusion matrix
conf_matrix = confusion_matrix(y_test_actual, y_pred_classes)

# Print the confusion matrix
print("Confusion Matrix:")
print(conf_matrix)

# Create a classification report
class_report = classification_report(y_test_actual, y_pred_classes)

# Print the classification report
print("Classification Report:")
print(class_report)

# Save the model including architecture, optimizer, and learned weights
model_cnn.save('deepfake_model.h5')

# Save the label encoder for future use
with open('label_encoder.pkl', 'wb') as le_file:
    pickle.dump(label_encoder, le_file)




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Index(['screen_name', 'text', 'account.type', 'class_type'], dtype='object')
Preprocessed data saved to preprocessed_dataset.csv


  df = pd.read_csv('df_with_vectors.csv')


Index(['screen_name', 'text', 'account.type', 'class_type',
       'preprocessed_text', 'feature_0', 'feature_1', 'feature_2', 'feature_3',
       'feature_4',
       ...
       'feature_290', 'feature_291', 'feature_292', 'feature_293',
       'feature_294', 'feature_295', 'feature_296', 'feature_297',
       'feature_298', 'feature_299'],
      dtype='object', length=305)
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Accuracy: 0.8749
Precision: 0.7655
Recall: 0.8749
F1 Score: 0.8165
Confusion Matrix:
[[    0     0     0     0   614]
 [    0     0     0     0  2030]
 [    0     0     0     0   725]
 [    0     0     0     0   668]
 [    0     0     0     0 28235]]
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       614
           1       0.00      0.00      0.00      2030
           2       0.00      0.00      0.00       725
           3       0.00      0.00      0.00       668
           4       0.87     

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  saving_api.save_model(
