<a href="https://colab.research.google.com/github/Meenusj/Case_study/blob/main/test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install fasttext

Collecting fasttext
  Downloading fasttext-0.9.2.tar.gz (68 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/68.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m68.8/68.8 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pybind11>=2.2 (from fasttext)
  Using cached pybind11-2.11.1-py3-none-any.whl (227 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (setup.py) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.2-cp310-cp310-linux_x86_64.whl size=4199775 sha256=9491d8619f9e8032b3e0d3cbadf203729e6265b96d281c784f81260d83cefb5d
  Stored in directory: /root/.cache/pip/wheels/a5/13/75/f811c84a8ab36eedbaef977a6a58a98990e8e0f1967f98f394
Successfully built fasttext
Installing collected packages: pybind11, fasttext
Successfully installed fasttext-0.9.2 pybind11-2.11.1


In [None]:
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz
!gzip -d cc.en.300.bin.gz


--2024-02-18 13:28:42--  https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 13.227.219.59, 13.227.219.33, 13.227.219.70, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|13.227.219.59|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4503593528 (4.2G) [application/octet-stream]
Saving to: ‘cc.en.300.bin.gz’


2024-02-18 13:29:13 (139 MB/s) - ‘cc.en.300.bin.gz’ saved [4503593528/4503593528]



In [None]:
import pickle
import re

import fasttext
import nltk
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score, f1_score
from tensorflow.keras.models import load_model

# Fetch NLTK's 'punkt' tokenizer data (used by word_tokenize further down).
nltk.download('punkt')

# Path of the unpacked fastText binary produced by the download cell.
FASTTEXT_MODEL_PATH = 'cc.en.300.bin'

# Load the pre-trained fastText embeddings into memory.
model = fasttext.load_model(FASTTEXT_MODEL_PATH)


# Text-cleaning helper. It is applied to the dataset's 'text' column below; if your file
# uses a different column name (e.g. 'tweet_text'), change it at the call site.
def preprocess(text):
    """Normalize one raw text into a space-separated string of clean tokens.

    Steps, applied to each whitespace-separated token:
      1. lower-case it;
      2. strip any ``#hashtag`` or ``@username`` substring;
      3. keep the token only if what remains is purely alphanumeric.

    Note that step 3 discards the *whole* token when it still contains any
    non-alphanumeric character (e.g. ``"hello,"`` or ``"good-morning"`` are
    dropped, not trimmed). This is kept as-is so the features stay comparable
    with anything produced by the same function earlier.

    Args:
        text: The raw text. Non-string values (``None``, or the float NaN that
            pandas produces for empty CSV cells) are treated as empty text.

    Returns:
        The surviving tokens joined by single spaces; ``''`` if none survive.
    """
    # Empty CSV cells arrive as float NaN, which has no .split(); treat as empty text.
    if not isinstance(text, str):
        return ''

    # Lower-case each token, then cut out hashtags and @mentions.
    stripped = (re.sub(r'#\w+|@\w+', '', word.lower()) for word in text.split())

    # isalnum() is False for '' too, so tokens emptied by the substitution vanish here.
    return ' '.join(word for word in stripped if word.isalnum())

# Evaluate the saved CNN classifier on a labelled CSV: load the model and label encoder,
# turn every row's text into one fixed-size embedding, predict, and report metrics.

# Load the pre-trained CNN model
loaded_model = load_model('/content/deepfake_model (1).h5')

# Load the label encoder.
# SECURITY: pickle.load can execute arbitrary code embedded in the file; only load a
# pickle that you created yourself or otherwise fully trust.
with open('/content/label_encoder (2).pkl', 'rb') as le_file:
    label_encoder = pickle.load(le_file)

# Load your new dataset (replace 'test.csv' with your actual dataset file)
new_csv_file_path = 'test.csv'
new_df = pd.read_csv(new_csv_file_path, delimiter=';')

# Preprocess the text in the 'text' column. Empty cells are read as NaN, so they are
# replaced with '' first to keep the string-only cleaning step from failing.
new_df['preprocessed_text'] = new_df['text'].fillna('').apply(preprocess)

# Tokenize the text: one list of tokens per row
new_tokenized_text = [word_tokenize(text) for text in new_df['preprocessed_text']]

# Build exactly ONE feature vector per row by averaging the fastText vectors of its tokens.
# The previous version emitted one vector per token across the whole file, so the feature
# table had one row per token and no longer lined up with the rows/labels of new_df
# (pd.concat padded the surplus rows with NaN labels and paired real labels with
# unrelated token vectors).
# NOTE(review): if the saved model was trained on features built the per-token way, it
# must be retrained on per-document vectors before these metrics are meaningful -
# TODO confirm against the training notebook.
embedding_dim = model.get_dimension()
new_word_vectors = np.array(
    [
        np.mean([model.get_word_vector(word) for word in tokens], axis=0)
        if tokens
        else np.zeros(embedding_dim, dtype=np.float32)  # row with no usable tokens
        for tokens in new_tokenized_text
    ],
    dtype=np.float32,
).reshape(len(new_df), embedding_dim)  # reshape keeps a 2-D shape even for an empty file

# Convert the document vectors to a DataFrame that shares new_df's index, so the
# column-wise concat below is guaranteed to be row-aligned.
new_word_vectors_df = pd.DataFrame(
    new_word_vectors,
    index=new_df.index,
    columns=[f'feature_{i}' for i in range(embedding_dim)],
)

# Concatenate the original DataFrame with the document vectors DataFrame
new_df_with_vectors = pd.concat([new_df, new_word_vectors_df], axis=1)
assert len(new_df_with_vectors) == len(new_df), "features and labels are misaligned"

# Extract feature columns (they start with 'feature_')
new_feature_columns = new_df_with_vectors.columns[new_df_with_vectors.columns.str.startswith('feature_')]

# Extract features and labels
new_X = new_df_with_vectors[new_feature_columns].values
new_y_actual = label_encoder.transform(new_df_with_vectors['class_type'])

# Reshape the input data to be compatible with Conv1D layer: (rows, features, 1)
new_X = new_X.reshape(new_X.shape[0], new_X.shape[1], 1)

# Predict on the new dataset; argmax picks the highest-scoring class index per row
new_y_pred_probs = loaded_model.predict(new_X)
new_y_pred_classes = new_y_pred_probs.argmax(axis=-1)

# Calculate evaluation metrics for the new dataset.
# zero_division=0 yields the same values as the default but without a warning for every
# class that is never predicted.
new_accuracy = accuracy_score(new_y_actual, new_y_pred_classes)
new_precision = precision_score(new_y_actual, new_y_pred_classes, average='weighted', zero_division=0)
new_recall = recall_score(new_y_actual, new_y_pred_classes, average='weighted', zero_division=0)
new_f1 = f1_score(new_y_actual, new_y_pred_classes, average='weighted', zero_division=0)

# Print the evaluation metrics for the new dataset
print("New Dataset Evaluation Metrics:")
print(f"Accuracy: {new_accuracy:.4f}")
print(f"Precision: {new_precision:.4f}")
print(f"Recall: {new_recall:.4f}")
print(f"F1 Score: {new_f1:.4f}")

# Create a confusion matrix for the new dataset (rows: actual, columns: predicted)
new_conf_matrix = confusion_matrix(new_y_actual, new_y_pred_classes)

# Print the confusion matrix for the new dataset
print("Confusion Matrix for the New Dataset:")
print(new_conf_matrix)

# Create a classification report for the new dataset
new_class_report = classification_report(new_y_actual, new_y_pred_classes, zero_division=0)

# Print the classification report for the new dataset
print("Classification Report for the New Dataset:")
print(new_class_report)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


New Dataset Evaluation Metrics:
Accuracy: 0.9344
Precision: 0.8730
Recall: 0.9344
F1 Score: 0.9027
Confusion Matrix for the New Dataset:
[[    0     0     0     0   384]
 [    0     0     0     0  1278]
 [    0     0     0     0   484]
 [    0     0     0     0   412]
 [    0     0     0     0 36412]]
Classification Report for the New Dataset:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       384
           1       0.00      0.00      0.00      1278
           2       0.00      0.00      0.00       484
           3       0.00      0.00      0.00       412
           4       0.93      1.00      0.97     36412

    accuracy                           0.93     38970
   macro avg       0.19      0.20      0.19     38970
weighted avg       0.87      0.93      0.90     38970



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
