In [1]:
import pandas as pd
import numpy as np
import networkx as nx
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import f1_score, classification_report
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from datetime import datetime
import time
import pickle

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
df = pd.read_csv('/content/drive/MyDrive/SocialMediaUsersDataset.csv')

In [4]:
df.fillna("", inplace=True)
df['DOB'] = pd.to_datetime(df['DOB'], errors='coerce')
df['Age'] = datetime.now().year - df['DOB'].dt.year
df['Age'] = df['Age'].fillna(0).astype(int)
df['InterestCount'] = df['Interests'].apply(lambda x: len(set(x.replace("'", "").split(", "))))
encoder = LabelEncoder()
df['CityEncoded'] = encoder.fit_transform(df['City'])
df['CountryEncoded'] = encoder.fit_transform(df['Country'])

In [5]:
G = nx.Graph()
for idx, row in df.iterrows():
    interests = row['Interests'].replace("'", "").split(", ")
    for interest in interests:
        G.add_edge(row['UserID'], interest)

sampled_nodes = list(G.nodes)
G = G.subgraph(sampled_nodes)
print(f"Graph reduced to {len(G.nodes)} nodes and {len(G.edges)} edges.")

Graph reduced to 100029 nodes and 285631 edges.


In [6]:
start_time = time.time()
pagerank = nx.pagerank(G)
print(f"PageRank calculated in {time.time() - start_time:.2f} seconds.")
df['PageRank'] = df['UserID'].apply(lambda x: pagerank.get(x, 0))
# Normalize data set features
scaler = MinMaxScaler()
df[['PageRank', 'InterestCount', 'Age']] = scaler.fit_transform(df[['PageRank', 'InterestCount', 'Age']])
print("Features normalized.")

PageRank calculated in 2.77 seconds.
Features normalized.


In [7]:
X = df[['PageRank', 'InterestCount', 'Age', 'CityEncoded', 'CountryEncoded']].values
y = (df['PageRank'] > 0.5).astype(int)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
models = {
    "RandomForest": RandomForestClassifier(n_estimators=100, random_state=42),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
    "LightGBM": LGBMClassifier(random_state=42),
}


In [9]:
results = {}
for name, model in models.items():
    print(f"Training {name}...")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    f1 = f1_score(y_test, y_pred)
    train_accuracy = model.score(X_train, y_train)
    test_accuracy = model.score(X_test, y_test)
    results[name] = {
        "F1 Score": f1,
        "Train Accuracy": train_accuracy,
        "Test Accuracy": test_accuracy,
    }
    print(f"{name} - F1 Score: {f1:.4f}")
    print(f"Train Accuracy: {train_accuracy:.4f}, Test Accuracy: {test_accuracy:.4f}")
    print(classification_report(y_test, y_pred))

Training RandomForest...
RandomForest - F1 Score: 1.0000
Train Accuracy: 1.0000, Test Accuracy: 1.0000
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     12390
           1       1.00      1.00      1.00      7610

    accuracy                           1.00     20000
   macro avg       1.00      1.00      1.00     20000
weighted avg       1.00      1.00      1.00     20000

Training XGBoost...


Parameters: { "use_label_encoder" } are not used.



XGBoost - F1 Score: 1.0000
Train Accuracy: 1.0000, Test Accuracy: 1.0000
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     12390
           1       1.00      1.00      1.00      7610

    accuracy                           1.00     20000
   macro avg       1.00      1.00      1.00     20000
weighted avg       1.00      1.00      1.00     20000

Training LightGBM...
[LightGBM] [Info] Number of positive: 30142, number of negative: 49858
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002553 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 789
[LightGBM] [Info] Number of data points in the train set: 80000, number of used features: 5
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.376775 -> initscore=-0.503259
[LightGBM] [Info] Start training from score -0.503259








LightGBM - F1 Score: 0.9976
Train Accuracy: 0.9997, Test Accuracy: 0.9981
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     12390
           1       1.00      1.00      1.00      7610

    accuracy                           1.00     20000
   macro avg       1.00      1.00      1.00     20000
weighted avg       1.00      1.00      1.00     20000





In [10]:
best_model_name = max(results, key=lambda x: results[x]["F1 Score"])
best_model = models[best_model_name]
print(f"Best Model: {best_model_name} with F1 Score: {results[best_model_name]['F1 Score']:.4f}")
print(f"Train Accuracy: {results[best_model_name]['Train Accuracy']:.4f}")
print(f"Test Accuracy: {results[best_model_name]['Test Accuracy']:.4f}")

Best Model: RandomForest with F1 Score: 1.0000
Train Accuracy: 1.0000
Test Accuracy: 1.0000


In [11]:
print("Saving the best model")
with open(f"best_friend_recommendation_model_{best_model_name}.pkl", "wb") as f:
    pickle.dump(best_model, f)
print(f"Model saved as best_friend_recommendation_model_{best_model_name}.pkl")

Saving the best model
Model saved as best_friend_recommendation_model_RandomForest.pkl


In [12]:
def recommend_friends(user_index, top_n=3):
    user = df.iloc[user_index]
    user_pagerank = user['PageRank']
    user_country = user['Country']
    similar_users = df[
        (df['PageRank'] > user_pagerank)
        & (df['CountryEncoded'] == user['CountryEncoded'])
    ].sort_values('PageRank', ascending=False)[:top_n]

    recommendations = similar_users[['UserID', 'Name', 'Gender', 'DOB', 'Interests', 'City', 'Country']]

    print(f"Recommendations for User {user['Name']} (UserID: {user['UserID']}):")
    for _, row in recommendations.iterrows():
        print(f"UserID {row['UserID']} ({row['Name']})")

    return recommendations


In [13]:
print("Friends recommendations on Meta Platforms")
recommendations = recommend_friends(0)
print(recommendations)

Friends recommendations on Meta Platforms
Recommendations for User Jesse Lawhorn (UserID: 1):
UserID 32047 (Joseph Harju)
UserID 17277 (Juan Johnson)
UserID 24650 (Edith Weir)
       UserID          Name  Gender        DOB  \
32046   32047  Joseph Harju  Female 1965-01-10   
17276   17277  Juan Johnson    Male 1961-06-18   
24649   24650    Edith Weir  Female 1962-08-14   

                                               Interests          City  \
32046  'Gardening', 'Travel', 'Fashion', 'Finance and...    Purwokerto   
17276  'Fashion', 'Technology', 'Music', 'Finance and...  Palangkaraya   
24649  'Music', 'Social causes and activism', 'Cookin...       Welahan   

         Country  
32046  Indonesia  
17276  Indonesia  
24649  Indonesia  
