In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import joblib

# Load training and testing datasets
train_data = pd.read_csv('train(1).csv')
test_data = pd.read_csv('test.csv')

# Print the columns to verify
print("Training Data Columns:", train_data.columns)
print("Testing Data Columns:", test_data.columns)

# Preprocess data function
def preprocess_data(data):
    # Feature engineering based on provided column names
    data['username_length'] = data['nums/length username']
    data['fullname_length'] = data['nums/length fullname']
    data['name_equals_username'] = data['name==username']
    data['description_length'] = data['description length']
    data['external_url_present'] = data['external URL'].apply(lambda x: 1 if pd.notnull(x) else 0)
    
    # Select the relevant features
    feature_columns = ['profile pic', 'username_length', 'fullname words', 'fullname_length', 
                       'name_equals_username', 'description_length', 'external_url_present', 
                       'private', '#posts', '#followers', '#follows']
    return data[feature_columns]

# Preprocess training and testing data
X_train = preprocess_data(train_data)
y_train = train_data['fake']
X_test = preprocess_data(test_data)
y_test = test_data['fake']

# Model training
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Model evaluation
y_pred = rf_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')

# Save the trained model
joblib.dump(rf_model, 'best_rf_model.pkl')


Training Data Columns: Index(['profile pic', 'nums/length username', 'fullname words',
       'nums/length fullname', 'name==username', 'description length',
       'external URL', 'private', '#posts', '#followers', '#follows', 'fake'],
      dtype='object')
Testing Data Columns: Index(['profile pic', 'nums/length username', 'fullname words',
       'nums/length fullname', 'name==username', 'description length',
       'external URL', 'private', '#posts', '#followers', '#follows', 'fake'],
      dtype='object')
Accuracy: 0.9166666666666666
Precision: 0.9166666666666666
Recall: 0.9166666666666666
F1 Score: 0.9166666666666666


['best_rf_model.pkl']