<a href="https://colab.research.google.com/github/Meenusj/Case_study/blob/main/cnn_with_allmini.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install sentence-transformers


Collecting sentence-transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl (227 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/227.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━[0m [32m174.1/227.1 kB[0m [31m5.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (

In [None]:
import pandas as pd

# Read the semicolon-delimited training file.
csv_file_path = 'train.csv'
df = pd.read_csv(csv_file_path, sep=';')

# Encode the label column as integers: human -> 0, bot -> 1 (any other value becomes NaN).
df['account.type'] = df['account.type'].map({'human': 0, 'bot': 1})

# Draw 1000 rows of each class, using a fixed seed for both draws.
bots = df.loc[df['account.type'] == 1].sample(n=1000, random_state=42)
humans = df.loc[df['account.type'] == 0].sample(n=1000, random_state=42)

# Stack both samples, shuffle all rows with a fixed seed, and renumber the index from 0.
balanced_df = (
    pd.concat([bots, humans])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Report how many rows carry each binary label.
account_type_counts = balanced_df['account.type'].value_counts()
print("Account Type Counts:", account_type_counts, sep="\n")

# Report how many rows fall into each 'class_type' value.
class_type_counts = balanced_df['class_type'].value_counts()
print("\nClass Type Counts:", class_type_counts, sep="\n")


Account Type Counts:
account.type
0    1000
1    1000
Name: count, dtype: int64

Class Type Counts:
class_type
human     1000
others     385
rnn        325
gpt2       290
Name: count, dtype: int64


In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report
from sentence_transformers import SentenceTransformer
import matplotlib.pyplot as plt
import nltk

# Download the NLTK packages; nothing visible later in this cell references them.
for nltk_package in ('punkt', 'averaged_perceptron_tagger', 'wordnet'):
    nltk.download(nltk_package)

# Instantiate the MiniLM sentence encoder by its model name.
model_name = 'all-MiniLM-L6-v2'
model = SentenceTransformer(model_name)

# Step 1: take the raw texts and the integer labels from the balanced frame.
X = balanced_df['text'].values
y = balanced_df['account.type'].values

# Fit the encoder, then apply it; `label_encoder.classes_` is read later to size the output layer.
label_encoder = LabelEncoder().fit(y)
y = label_encoder.transform(y)

# Hold out 30% of the rows, then halve that hold-out into validation and test parts.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

# Step 2: Get MiniLM Embeddings
def get_minilm_embeddings(model, texts):
    """Encode ``texts`` with ``model`` and return the embeddings as a NumPy array.

    Args:
        model: Object exposing ``encode(texts, convert_to_tensor=True)``; this
            notebook passes a ``SentenceTransformer`` instance.
        texts: The inputs, forwarded to ``model.encode`` unchanged.

    Returns:
        The tensor returned by ``model.encode``, moved to the CPU and converted
        with ``.numpy()``.
    """
    encoded = model.encode(texts, convert_to_tensor=True)
    # The tensor must live on the CPU before it can be turned into a NumPy array.
    on_cpu = encoded.cpu()
    return on_cpu.numpy()

# Embed every split once; the classifier below is trained on these fixed sentence vectors.
X_train_embeddings = get_minilm_embeddings(model, X_train)
X_val_embeddings = get_minilm_embeddings(model, X_val)
X_test_embeddings = get_minilm_embeddings(model, X_test)

# Step 3: Define and compile your model with MiniLM Embeddings
# Seed the Python, NumPy and TensorFlow RNGs before any layer is created so that weight
# initialisation, dropout masks and batch shuffling repeat from run to run.
# NOTE(review): bit-exact repeatability on GPU may need deterministic ops as well - confirm.
tf.keras.utils.set_random_seed(42)

# Per-sample feature shape: everything after the first (sample) axis of the embedding matrix.
input_shape = X_train_embeddings.shape[1:]

# NOTE: despite the `model_cnn` name this is a plain stack of Dense layers; it has no convolutions.
model_cnn = tf.keras.Sequential([
    tf.keras.layers.Input(shape=input_shape),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(32, activation='relu'),
    # One softmax output per class discovered by the label encoder.
    tf.keras.layers.Dense(len(label_encoder.classes_), activation='softmax')
])

# Targets are integer class ids, hence the sparse categorical loss.
model_cnn.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

# Step 4: Training
# Early stopping: give up after 3 epochs without improvement and restore the best weights seen.
history = model_cnn.fit(X_train_embeddings, y_train,
                        epochs=10,
                        batch_size=64,
                        validation_data=(X_val_embeddings, y_val),
                        callbacks=[tf.keras.callbacks.EarlyStopping(patience=3, restore_best_weights=True)])

# Step 5: Evaluation
loss, accuracy = model_cnn.evaluate(X_test_embeddings, y_test)
print(f'Test accuracy: {accuracy}')

# Confusion Matrix
# The predicted class is the index of the largest softmax output in each row.
y_pred = np.argmax(model_cnn.predict(X_test_embeddings), axis=1)
cm = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:')
print(cm)


# Print Actual vs Predicted Outputs
actual_vs_predicted = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
print("Actual vs Predicted Outputs:")
print(actual_vs_predicted.head(30))  # Display the first 30 predictions

# Save the model
model_cnn.save('text_classification_model_with_minilm_cnn.h5')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test accuracy: 0.7599999904632568
Confusion Matrix:
[[121  35]
 [ 37 107]]
Actual vs Predicted Outputs:
    Actual  Predicted
0        0          0
1        0          0
2        1          0
3        1          1
4        0          0
5        0          0
6        0          0
7        0          0
8        0          0
9        0          0
10       0          1
11       0          0
12       0          1
13       1          0
14       1          1
15       0          0
16       1          0
17       1          0
18       0          1
19       1          1
20       1          1
21       0          0
22       0          0
23       1          1
24       1          0
25       1          0
26       0          1
27       1          1
28       0          0
29       1          1


  saving_api.save_model(


In [None]:
import pandas as pd

# Read the semicolon-delimited training file and, in the same expression, replace the
# textual labels with integers (human -> 0, bot -> 1; any other value becomes NaN).
csv_file_path = 'train.csv'
df = pd.read_csv(csv_file_path, sep=';').assign(
    **{'account.type': lambda frame: frame['account.type'].map({'human': 0, 'bot': 1})}
)

In [None]:

# Tally the rows per label in the full training frame and print the tally under a heading.
account_type_counts = df['account.type'].value_counts()
print("Account Type Counts:", account_type_counts, sep="\n")

Account Type Counts:
account.type
0    10358
1    10354
Name: count, dtype: int64


In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report
from sentence_transformers import SentenceTransformer
import matplotlib.pyplot as plt
import nltk

# Download the NLTK packages; nothing visible later in this cell references them.
for nltk_package in ('punkt', 'averaged_perceptron_tagger', 'wordnet'):
    nltk.download(nltk_package)

# Instantiate the MiniLM sentence encoder by its model name.
model_name = 'all-MiniLM-L6-v2'
model = SentenceTransformer(model_name)

# Step 1: take the raw texts and the integer labels from the full training frame.
X = df['text'].values
y = df['account.type'].values

# Fit the encoder, then apply it; `label_encoder.classes_` is read later to size the output layer.
label_encoder = LabelEncoder().fit(y)
y = label_encoder.transform(y)

# Hold out 30% of the rows, then halve that hold-out into validation and test parts.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

# Step 2: Get MiniLM Embeddings
def get_minilm_embeddings(model, texts):
    """Encode ``texts`` with ``model`` and return the embeddings as a NumPy array.

    Args:
        model: Object exposing ``encode(texts, convert_to_tensor=True)``; this
            notebook passes a ``SentenceTransformer`` instance.
        texts: The inputs, forwarded to ``model.encode`` unchanged.

    Returns:
        The tensor returned by ``model.encode``, moved to the CPU and converted
        with ``.numpy()``.
    """
    encoded = model.encode(texts, convert_to_tensor=True)
    # The tensor must live on the CPU before it can be turned into a NumPy array.
    on_cpu = encoded.cpu()
    return on_cpu.numpy()

# Embed every split once; the classifier below is trained on these fixed sentence vectors.
X_train_embeddings = get_minilm_embeddings(model, X_train)
X_val_embeddings = get_minilm_embeddings(model, X_val)
X_test_embeddings = get_minilm_embeddings(model, X_test)

# Step 3: Define and compile your model with MiniLM Embeddings
# Seed the Python, NumPy and TensorFlow RNGs before any layer is created so that weight
# initialisation, dropout masks and batch shuffling repeat from run to run.
# NOTE(review): bit-exact repeatability on GPU may need deterministic ops as well - confirm.
tf.keras.utils.set_random_seed(42)

# Per-sample feature shape: everything after the first (sample) axis of the embedding matrix.
input_shape = X_train_embeddings.shape[1:]

# NOTE: despite the `model_cnn` name this is a plain stack of Dense layers; it has no convolutions.
model_cnn = tf.keras.Sequential([
    tf.keras.layers.Input(shape=input_shape),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(32, activation='relu'),
    # One softmax output per class discovered by the label encoder.
    tf.keras.layers.Dense(len(label_encoder.classes_), activation='softmax')
])

# Targets are integer class ids, hence the sparse categorical loss.
model_cnn.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

# Step 4: Training
# Early stopping: give up after 3 epochs without improvement and restore the best weights seen.
history = model_cnn.fit(X_train_embeddings, y_train,
                        epochs=10,
                        batch_size=64,
                        validation_data=(X_val_embeddings, y_val),
                        callbacks=[tf.keras.callbacks.EarlyStopping(patience=3, restore_best_weights=True)])

# Step 5: Evaluation
loss, accuracy = model_cnn.evaluate(X_test_embeddings, y_test)
print(f'Test accuracy: {accuracy}')

# Confusion Matrix
# The predicted class is the index of the largest softmax output in each row.
y_pred = np.argmax(model_cnn.predict(X_test_embeddings), axis=1)
cm = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:')
print(cm)


# Print Actual vs Predicted Outputs
actual_vs_predicted = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
print("Actual vs Predicted Outputs:")
print(actual_vs_predicted.head(30))  # Display the first 30 predictions

# Save the model
model_cnn.save('text_classification_model_with_minilm_cnn.h5')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test accuracy: 0.8278082013130188
Confusion Matrix:
[[1196  363]
 [ 172 1376]]
Actual vs Predicted Outputs:
    Actual  Predicted
0        1          1
1        1          1
2        1          0
3        0          0
4        0          0
5        1          0
6        0          0
7        0          1
8        1          1
9        1          1
10       1          0
11       0          0
12       1          1
13       0          0
14       1          1
15       1          1
16       1          1
17       0          0
18       1          0
19       1          1
20       0          1
21       1          1
22       1          1
23       1          0
24       0          0
25       1          1
26       0          1
27       1          1
28       1          1
29       1          1


  saving_api.save_model(


In [None]:
import pandas as pd

# Load the CSV file
csv_file_path = 'test.csv'
df_test = pd.read_csv(csv_file_path, sep=';')
# A bare expression in the middle of a cell is not rendered, so print the preview explicitly.
print(df_test.head())

# Convert 'account.type' to 0 and 1 (values other than 'human'/'bot' become NaN)
df_test['account.type'] = df_test['account.type'].map({'human': 0, 'bot': 1})

# Count occurrences in the 'account.type' column of the test frame.
# This must read `df_test`: `df` is the frame loaded from train.csv, so counting on it
# would report the training label distribution instead of the test one.
account_type_counts = df_test['account.type'].value_counts()
print("Account Type Counts:")
print(account_type_counts)

Account Type Counts:
account.type
bot      1280
human    1278
Name: count, dtype: int64


In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report
from sentence_transformers import SentenceTransformer
import matplotlib.pyplot as plt
import nltk

# Download the NLTK packages; nothing visible later in this cell references them.
for nltk_package in ('punkt', 'averaged_perceptron_tagger', 'wordnet'):
    nltk.download(nltk_package)

# Instantiate the MiniLM sentence encoder by its model name.
model_name = 'all-MiniLM-L6-v2'
model = SentenceTransformer(model_name)

# Step 1: take the raw texts and the integer labels from the balanced frame.
X = balanced_df['text'].values
y = balanced_df['account.type'].values

# Fit the encoder, then apply it; `label_encoder.classes_` is read later to size the output layer.
label_encoder = LabelEncoder().fit(y)
y = label_encoder.transform(y)

# Only a train/validation split here (70% / 30%); the test data is loaded from its own file later.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Step 2: Get MiniLM Embeddings
def get_minilm_embeddings(model, texts):
    """Encode ``texts`` with ``model`` and return the embeddings as a NumPy array.

    Args:
        model: Object exposing ``encode(texts, convert_to_tensor=True)``; this
            notebook passes a ``SentenceTransformer`` instance.
        texts: The inputs, forwarded to ``model.encode`` unchanged.

    Returns:
        The tensor returned by ``model.encode``, moved to the CPU and converted
        with ``.numpy()``.
    """
    encoded = model.encode(texts, convert_to_tensor=True)
    # The tensor must live on the CPU before it can be turned into a NumPy array.
    on_cpu = encoded.cpu()
    return on_cpu.numpy()

# Embed the training and validation texts once; the classifier trains on these fixed vectors.
X_train_embeddings = get_minilm_embeddings(model, X_train)
X_val_embeddings = get_minilm_embeddings(model, X_val)

# Step 3: Define and compile your model with MiniLM Embeddings
# Seed the Python, NumPy and TensorFlow RNGs before any layer is created so that weight
# initialisation, dropout masks and batch shuffling repeat from run to run.
# NOTE(review): bit-exact repeatability on GPU may need deterministic ops as well - confirm.
tf.keras.utils.set_random_seed(42)

# Per-sample feature shape: everything after the first (sample) axis of the embedding matrix.
input_shape = X_train_embeddings.shape[1:]

# NOTE: despite the `model_cnn` name this is a plain stack of Dense layers; it has no convolutions.
model_cnn = tf.keras.Sequential([
    tf.keras.layers.Input(shape=input_shape),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(32, activation='relu'),
    # One softmax output per class discovered by the label encoder.
    tf.keras.layers.Dense(len(label_encoder.classes_), activation='softmax')
])

# Targets are integer class ids, hence the sparse categorical loss.
model_cnn.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

# Step 4: Training
# Early stopping: give up after 3 epochs without improvement and restore the best weights seen.
history = model_cnn.fit(X_train_embeddings, y_train,
                        epochs=10,
                        batch_size=64,
                        validation_data=(X_val_embeddings, y_val),
                        callbacks=[tf.keras.callbacks.EarlyStopping(patience=3, restore_best_weights=True)])

# Step 5: Load and Prepare Test Data
test_df = pd.read_csv('test.csv', sep=';')
test_df['account.type'] = test_df['account.type'].map({'human': 0, 'bot': 1})
X_test = test_df['text'].values
# Pass the test labels through the fitted encoder, exactly like the training labels, so an
# unmapped (NaN) or otherwise unseen label raises here instead of silently reaching the loss.
y_test = label_encoder.transform(test_df['account.type'].values)

X_test_embeddings = get_minilm_embeddings(model, X_test)

# Step 6: Evaluation
loss, accuracy = model_cnn.evaluate(X_test_embeddings, y_test)
print(f'Test accuracy: {accuracy}')

# Confusion Matrix
# The predicted class is the index of the largest softmax output in each row.
y_pred = np.argmax(model_cnn.predict(X_test_embeddings), axis=1)
cm = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:')
print(cm)

# Classification Report
# The names follow the label mapping used above: 0 -> human, 1 -> bot.
report = classification_report(y_test, y_pred, target_names=['human', 'bot'])
print('Classification Report:')
print(report)

# Print Actual vs Predicted Outputs
actual_vs_predicted = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
print("Actual vs Predicted Outputs:")
print(actual_vs_predicted.head(30))  # Display the first 30 predictions

# Save the model
model_cnn.save('text_classification_model_with_minilm_cnn.h5')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test accuracy: 0.7615324258804321
Confusion Matrix:
[[ 939  339]
 [ 271 1009]]
Classification Report:
              precision    recall  f1-score   support

       human       0.78      0.73      0.75      1278
         bot       0.75      0.79      0.77      1280

    accuracy                           0.76      2558
   macro avg       0.76      0.76      0.76      2558
weighted avg       0.76      0.76      0.76      2558

Actual vs Predicted Outputs:
    Actual  Predicted
0        0          1
1        0          0
2        0          0
3        1          1
4        0          0
5        1          1
6        1          1
7        0          1
8        0          0
9        0          0
10       0          0
11       1          1
12       0          0
13       1          1
14       0          1
15       0          0
16       0          1
17       1          1
18       0   

  saving_api.save_model(
