In [13]:
pip install numpy pandas scikit-learn




In [14]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve


In [15]:
# Replace 'filename.csv' with the actual path to your CSV file
file_path = 'instagram.csv'

# Read the CSV file into a DataFrame
data = pd.read_csv(file_path)

# Display the first few rows of the DataFrame to verify that the data was loaded correctly
print(data.head())


   profile pic  nums/length username  fullname words  nums/length fullname  \
0            1                  0.27               0                   0.0   
1            1                  0.00               2                   0.0   
2            1                  0.10               2                   0.0   
3            1                  0.00               1                   0.0   
4            1                  0.00               2                   0.0   

   name==username  description length  external URL  private  #posts  \
0               0                  53             0        0      32   
1               0                  44             0        0     286   
2               0                   0             0        1      13   
3               0                  82             0        0     679   
4               0                   0             0        1       6   

   #followers  #follows  fake  
0        1000       955     0  
1        2740       533     0  
2 

In [16]:
#addressing the outliers
features_with_outliers = ['#posts', '#followers']

for feature in features_with_outliers:
    data[feature] = np.log1p(data[feature])

In [17]:
import pandas as pd

# Assuming 'data' is your DataFrame containing the dataset
# Define features (X) and target variable (y)
X = data.drop('fake', axis=1)  # Drop the column containing the target variable
y = data['fake']  # Select only the column containing the target variable

# Split the data into training and testing sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Initialize the StandardScaler
scaler = StandardScaler()

# Fit the scaler on the training data and transform the training features
X_train_scaled = scaler.fit_transform(X_train)

# Transform the testing features using the scaler fitted on the training data
X_test_scaled = scaler.transform(X_test)

# Initialize the Logistic Regression model
model = LogisticRegression()

# Train the model on the scaled training data
model.fit(X_train_scaled, y_train)

# Predict on the testing data
y_pred = model.predict(X_test_scaled)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Model Evaluation:")
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)


Model Evaluation:
Accuracy: 0.9310344827586207
Precision: 0.9591836734693877
Recall: 0.8867924528301887
F1 Score: 0.9215686274509803


In [19]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.utils import class_weight
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix


# Define class weights to address class imbalance
class_weights = class_weight.compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)

# Initialize Logistic Regression model
model = LogisticRegression()

# Define hyperparameters for grid search
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],  # Regularization parameter
    'penalty': ['l1', 'l2']  # Regularization penalty
}

# Perform grid search with 5-fold stratified cross-validation
grid_search = GridSearchCV(model, param_grid, cv=StratifiedKFold(n_splits=5), scoring='f1', verbose=1)

# Fit grid search to the scaled training data
grid_search.fit(X_train_scaled, y_train)

# Get best hyperparameters and corresponding model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Evaluate best model on the testing data
y_pred = best_model.predict(X_test_scaled)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print("Best Model Evaluation:")
print("Best Hyperparameters:", best_params)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("\nConfusion Matrix:")
print(conf_matrix)


Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best Model Evaluation:
Best Hyperparameters: {'C': 0.1, 'penalty': 'l2'}
Accuracy: 0.9137931034482759
Precision: 0.9574468085106383
Recall: 0.8490566037735849
F1 Score: 0.9

Confusion Matrix:
[[61  2]
 [ 8 45]]


30 fits failed out of a total of 60.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
30 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py", line 54, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

        nan 0.93346219        nan 0.92

In [20]:
pip install skl2onnx



In [21]:
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType

# Convert the model to ONNX format
onnx_model = convert_sklearn(best_model, initial_types=[('input', FloatTensorType([None, X_train_scaled.shape[1]]))])

# Save the ONNX model to a file
with open('model.onnx', 'wb') as f:
    f.write(onnx_model.SerializeToString())


In [22]:
pip install onnxruntime




In [23]:
import numpy as np
import onnxruntime as rt

# Load the ONNX model
sess = rt.InferenceSession("model.onnx")

# Define the attribute names
attribute_names = ["profile pic(0 or 1)", "fullname words", "name==username(0 or 1)", "description length", "external URL(0 or 1)", "private(0 or 1)", "#posts", "#followers", "#follows", "nums/length fullname", "nums/length username"]

# Function to preprocess user input
def preprocess_input(user_input):
    # Preprocess the user input (e.g., convert to numpy array)
    # Make sure to preprocess the input in the same way as the training data
    user_input_processed = np.array(user_input)  # Assuming user_input is a list or array
    return user_input_processed.astype(np.float32)

# Function to scale input data
def scale_input(input_data):
    # Scale the input data using the same scaler used during training
    input_data_scaled = scaler.transform(input_data.reshape(1, -1))  # Scale the input
    return input_data_scaled

# Function to make predictions
def predict(input_data):
    input_name = sess.get_inputs()[0].name
    output_name = sess.get_outputs()[0].name
    predictions = sess.run([output_name], {input_name: input_data})[0]
    return predictions

# Main function to take user input and make predictions
def main():
    # Take input for each attribute
    user_input = []
    for attribute_name in attribute_names:
        if attribute_name == "nums/length fullname":
            nums_fullname = float(input("Enter the number of numeric characters in the fullname: "))
            len_fullname = float(input("Enter the length of the fullname: "))
            nums_len_fullname = nums_fullname / len_fullname
            user_input.append(nums_len_fullname)
        elif attribute_name == "nums/length username":
            nums_username = float(input("Enter the number of numeric characters in the username: "))
            len_username = float(input("Enter the length of the username: "))
            nums_len_username = nums_username / len_username
            user_input.append(nums_len_username)
        elif attribute_name == "#posts" or attribute_name == "#followers" or attribute_name == "#follows":
            attribute_value = int(input(f"Enter value for {attribute_name}: "))
            user_input.append(attribute_value)
        else:
            attribute_value = input(f"Enter value for {attribute_name}: ")
            user_input.append(float(attribute_value))

    # Preprocess user input
    input_data = preprocess_input(user_input)

    # Scale input data
    input_data_scaled = scale_input(input_data)

    # Make predictions
    predictions = predict(input_data_scaled)

    # Display predictions
    predicted_class = "fake" if predictions[0] > 0.5 else "not fake"
    print("Predicted class:", predicted_class)

if __name__ == "__main__":
    main()


Enter value for profile pic(0 or 1): 0
Enter value for fullname words: 7
Enter value for name==username(0 or 1): 0
Enter value for description length: 34
Enter value for external URL(0 or 1): 0
Enter value for private(0 or 1): 1
Enter value for #posts: 20
Enter value for #followers: 437
Enter value for #follows: 456
Enter the number of numeric characters in the fullname: 0
Enter the length of the fullname: 7
Enter the number of numeric characters in the username: 4
Enter the length of the username: 11
Predicted class: not fake




In [None]:
The data set we trained is small and it may change the output for some cases
