In [15]:
import pandas as pd
import numpy as np 
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

In [16]:
data = pd.read_csv("combined_accs.csv")

data.head(10)



Unnamed: 0,userFollowerCount,userFollowingCount,userBiographyLength,userMediaCount,userHasProfilPic,userIsPrivate,usernameDigitCount,usernameLength,isFake
0,25,1937,0.0,0,1,1,0.0,10,1.0
1,324,4122,0.0,0,1,0,4.0,15,1.0
2,15,399,0.0,0,0,0,3.0,12,1.0
3,14,107,0.0,1,1,0,1.0,10,1.0
4,264,4651,0.0,0,1,0,0.0,14,1.0
5,33,1470,0.0,2,1,1,4.0,13,1.0
6,420,4883,30.0,8,1,0,0.0,12,1.0
7,442,6662,0.0,396,1,0,0.0,11,1.0
8,816,7497,0.0,85,1,0,3.0,15,1.0
9,150,6631,1.0,0,1,1,3.0,8,1.0


In [17]:
rows, cols = data.shape

data= data.drop(columns="usernameDigitCount",axis=1)
print(f"Number of rows: {rows}")
print(f"Number of columns: {cols}")

Number of rows: 1294
Number of columns: 9


In [18]:
data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1294 entries, 0 to 1293
Data columns (total 8 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   userFollowerCount    1294 non-null   int64  
 1   userFollowingCount   1294 non-null   int64  
 2   userBiographyLength  1294 non-null   float64
 3   userMediaCount       1294 non-null   int64  
 4   userHasProfilPic     1294 non-null   int64  
 5   userIsPrivate        1294 non-null   int64  
 6   usernameLength       1294 non-null   int64  
 7   isFake               1294 non-null   float64
dtypes: float64(2), int64(6)
memory usage: 81.0 KB


In [19]:
null_counts = data.isnull().sum()

print("Number of null values in each column:")
print(null_counts)

# If you want the total number of null values in the entire dataset:
total_nulls = data.isnull().sum().sum()
print(f"\nTotal number of null values in the dataset: {total_nulls}")

Number of null values in each column:
userFollowerCount      0
userFollowingCount     0
userBiographyLength    0
userMediaCount         0
userHasProfilPic       0
userIsPrivate          0
usernameLength         0
isFake                 0
dtype: int64

Total number of null values in the dataset: 0


In [20]:
data["isFake"] = data["isFake"].astype(int)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1294 entries, 0 to 1293
Data columns (total 8 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   userFollowerCount    1294 non-null   int64  
 1   userFollowingCount   1294 non-null   int64  
 2   userBiographyLength  1294 non-null   float64
 3   userMediaCount       1294 non-null   int64  
 4   userHasProfilPic     1294 non-null   int64  
 5   userIsPrivate        1294 non-null   int64  
 6   usernameLength       1294 non-null   int64  
 7   isFake               1294 non-null   int64  
dtypes: float64(1), int64(7)
memory usage: 81.0 KB


In [21]:
from sklearn.utils import shuffle
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Step 1: Shuffle the dataset to remove bias
final_data = shuffle(data, random_state=42).reset_index(drop=True)

X = final_data.drop(columns=['isFake'])  
y = final_data['isFake']  

# Step 3: Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=1, stratify=y
)

# Step 4: Apply Sequential Feature Selection (SFS)
clf = RandomForestClassifier(n_estimators=100, random_state=1)

sfs = SFS(clf, 
          k_features=6,
          forward=True,  
          scoring="accuracy",  
          cv=5)  

sfs.fit(X_train, y_train)

# Step 5: Get selected feature names
columnlist = list(X_train.columns)
feature_cols = list(sfs.k_feature_idx_)
subset = [columnlist[i] for i in feature_cols]

print("Selected Features:", subset)



Selected Features: ['userFollowerCount', 'userFollowingCount', 'userMediaCount', 'userHasProfilPic', 'userIsPrivate', 'usernameLength']


Selected Features: ['userFollowerCount', 'userFollowingCount', 'userMediaCount', 'userHasProfilPic', 'userIsPrivate', 'usernameLength']

In [22]:
mpg_df=final_data[subset].copy()
mpg_df = pd.concat([mpg_df, final_data['isFake']], axis=1)
mpg_df

Unnamed: 0,userFollowerCount,userFollowingCount,userMediaCount,userHasProfilPic,userIsPrivate,usernameLength,isFake
0,667,660,20,1,1,7,0
1,204,445,92,1,1,8,0
2,576,680,187,1,1,11,0
3,319,328,6,1,1,11,0
4,15,305,29,0,1,18,1
...,...,...,...,...,...,...,...
1289,43,30,15,1,1,6,0
1290,79,90,0,1,1,7,0
1291,581,544,58,1,1,11,0
1292,472,534,27,1,1,7,0


In [23]:
X = mpg_df.drop(columns=['isFake'])  
y = mpg_df['isFake']  

# Step 3: Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=1, stratify=y
)

In [24]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import numpy as np

# Initialize Decision Tree Classifier
dt = DecisionTreeClassifier(random_state=42)

# Define parameter grid for tuning
param_grid = {
    'max_depth': [None, 5, 10],              
    'min_samples_split': [2, 5],              
    'min_samples_leaf': [1, 2]                  
}

# Perform Grid Search with Cross-Validation
grid_search = GridSearchCV(
    estimator=dt, 
    param_grid=param_grid, 
    scoring='accuracy',   # Use classification metric
    cv=5,                
    verbose=1           
)

# Fit GridSearchCV on training data
grid_search.fit(X_train, y_train)

# Get best parameters and best model
best_params = grid_search.best_params_
best_score = grid_search.best_score_  # Best cross-validated accuracy
best_model = grid_search.best_estimator_

# Make predictions on test data
y_pred = best_model.predict(X_test)

# Classification Metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)

# Print results
print(f"Best Parameters: {best_params}")
print(f"Cross-validated Accuracy: {best_score:.4f}")
print(f"Test Set Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"ROC AUC Score: {roc_auc:.4f}")

Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best Parameters: {'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 2}
Cross-validated Accuracy: 0.9547
Test Set Accuracy: 0.9537
Precision: 0.9500
Recall: 0.8444
F1 Score: 0.8941
ROC AUC Score: 0.9155


In [25]:
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import numpy as np

# Convert target labels to integers (fixes the error)
y_train = y_train.astype(int)
y_test = y_test.astype(int)

# Initialize XGBoost Classifier
xgb = XGBClassifier(random_state=42, eval_metric="logloss")

# Define parameter grid for tuning
param_grid = {
    'max_depth': [3, 5, 7],                  
    'learning_rate': [0.01, 0.1, 0.2],        
    'n_estimators': [50, 100, 150],           
    'subsample': [0.8, 1.0],                  
    'colsample_bytree': [0.8, 1.0]            
}

# Perform Grid Search with Cross-Validation
grid_search = GridSearchCV(
    estimator=xgb, 
    param_grid=param_grid, 
    scoring='accuracy',   # Use classification metric
    cv=5,                
    verbose=1,
    n_jobs=-1           
)

# Fit GridSearchCV on training data
grid_search.fit(X_train, y_train)

# Get best parameters and best model
best_params = grid_search.best_params_
best_score = grid_search.best_score_  # Best cross-validated accuracy
best_model = grid_search.best_estimator_

# Make predictions on training and test data
y_train_pred = best_model.predict(X_train)
y_test_pred = best_model.predict(X_test)

# Training and Test Accuracy
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

# Other Classification Metrics
precision = precision_score(y_test, y_test_pred)
recall = recall_score(y_test, y_test_pred)
f1 = f1_score(y_test, y_test_pred)
roc_auc = roc_auc_score(y_test, y_test_pred)

# Print results
print(f"Best Parameters: {best_params}")
print(f"Cross-validated Accuracy: {best_score:.4f}")
print(f"Training Accuracy: {train_accuracy:.4f}")
print(f"Test Set Accuracy: {test_accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"ROC AUC Score: {roc_auc:.4f}")



Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 50, 'subsample': 0.8}
Cross-validated Accuracy: 0.9713
Training Accuracy: 0.9923
Test Set Accuracy: 0.9589
Precision: 0.9205
Recall: 0.9000
F1 Score: 0.9101
ROC AUC Score: 0.9383


Best Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 50, 'subsample': 1.0}
Cross-validated Accuracy: 0.9617
Training Accuracy: 0.9784
Test Set Accuracy: 0.9805

In [26]:
def analyze_instagram_account(model, follower_count, following_count, media_count, 
                            has_profile_pic, is_private, username_length, verbose=True):
    """
    Analyze an Instagram account with detailed explanation of risk factors.
    """
    import pandas as pd
    import numpy as np
    
    # Create sample data
    sample = pd.DataFrame({
        'userFollowerCount': [follower_count],
        'userFollowingCount': [following_count],
        'userMediaCount': [media_count],
        'userHasProfilPic': [has_profile_pic],
        'userIsPrivate': [is_private],
        'usernameLength': [username_length]
    })
    
    # Make prediction
    prediction = model.predict(sample)[0]
    probability = model.predict_proba(sample)[0]
    fake_probability = probability[1]
    
    # Analyze risk factors
    risk_factors = []
    
    # Following to follower ratio
    if following_count / max(follower_count, 1) > 10:
        risk_factors.append(f"High following/follower ratio: {following_count}/{follower_count}")
    
    # Low follower count
    if follower_count < 50:
        risk_factors.append(f"Very low follower count: {follower_count}")
    
    # No profile picture
    if not has_profile_pic:
        risk_factors.append("No profile picture")
    
    # Unusual username length
    if username_length > 15:
        risk_factors.append(f"Unusually long username: {username_length} characters")
    
    # Unusual media to follower ratio
    if media_count > follower_count:
        risk_factors.append(f"High media count ({media_count}) compared to followers ({follower_count})")
    
    if verbose:
        print("\nAccount Analysis Results:")
        print("-" * 50)
        print(f"Prediction: {'FAKE' if prediction == 1 else 'REAL'}")
        print(f"Confidence: {max(probability) * 100:.2f}%")
        print(f"Probability of being fake: {fake_probability * 100:.2f}%")
        print("\nAccount Features:")
        print(f"- Followers: {follower_count}")
        print(f"- Following: {following_count}")
        print(f"- Media Count: {media_count}")
        print(f"- Has Profile Picture: {'Yes' if has_profile_pic else 'No'}")
        print(f"- Private Account: {'Yes' if is_private else 'No'}")
        print(f"- Username Length: {username_length}")
        
        if risk_factors:
            print("\nPotential Risk Factors:")
            for factor in risk_factors:
                print(f"- {factor}")
        else:
            print("\nNo significant risk factors identified")
    
    return prediction, fake_probability, risk_factors

# Test with the specific case
prediction, prob, risks = analyze_instagram_account(
    best_model,
    follower_count=200,
    following_count=100,
    media_count=30,
    has_profile_pic=0,
    is_private=0,
    username_length=20,
)



Account Analysis Results:
--------------------------------------------------
Prediction: REAL
Confidence: 94.65%
Probability of being fake: 5.35%

Account Features:
- Followers: 200
- Following: 100
- Media Count: 30
- Has Profile Picture: No
- Private Account: No
- Username Length: 20

Potential Risk Factors:
- No profile picture
- Unusually long username: 20 characters


In [27]:
import joblib

model_filename = "fake_account_model.pkl"
joblib.dump(best_model, model_filename)  # Replace 'model' with your trained model variable

['fake_account_model.pkl']

In [28]:
# import pandas as pd
# import numpy as np
# import random

# def generate_fake_accounts(n_rows=50):
#     """
#     Generate fake Instagram account data with specific characteristics
#     """
#     # Initialize lists for each column
#     followers = np.random.randint(0, 21, n_rows)  # 0-20 followers
#     following = np.random.randint(300, 1001, n_rows)  # 300-1000 following
#     bio_length = np.zeros(n_rows)  # Usually 0 for fake accounts
#     media_count = np.random.randint(15, 31, n_rows)  # 15-30 media count
    
#     # Generate binary features with higher probability of specific values
#     has_profile_pic = np.random.choice([0, 1], n_rows, p=[0.7, 0.3])  # More likely to have no profile pic
#     is_private = np.random.choice([0, 1], n_rows, p=[0.6, 0.4])  # More likely to be public
    
#     # Generate usernames of varying lengths (8-20 characters)
#     username_length = np.random.randint(8, 21, n_rows)
    
#     # All accounts are fake in this generation
#     is_fake = np.ones(n_rows)
    
#     # Create DataFrame
#     df = pd.DataFrame({
#         'userFollowerCount': followers,
#         'userFollowingCount': following,
#         'userBiographyLength': bio_length,
#         'userMediaCount': media_count,
#         'userHasProfilPic': has_profile_pic,
#         'userIsPrivate': is_private,
#         'usernameLength': username_length,
#         'isFake': is_fake
#     })
    
#     return df

# # Generate the fake accounts
# fake_accounts = generate_fake_accounts(50)

# # Display first few rows to verify
# print("Sample of generated fake accounts:")
# print(fake_accounts.head())

# # Basic statistics of generated data
# print("\nStatistics of generated data:")
# print(fake_accounts.describe())
