In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
import ast
import networkx
import numpy as np
import warnings
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score, roc_curve, roc_auc_score, confusion_matrix, ConfusionMatrixDisplay, balanced_accuracy_score
from sklearn.linear_model import LogisticRegression
from config import CSV_PATH_FRIENDS, CSV_PATH_TEST, CSV_PATH_TRAIN, CSV_PATH_POSTS, CSV_PATH_PROFILES
from config import CSV_PATH_SAVE1, CSV_PATH_SAVE2

warnings.filterwarnings('ignore')

### Load data

In [2]:
df_profiles = pd.read_csv(CSV_PATH_PROFILES)
df_posts = pd.read_csv(CSV_PATH_POSTS)
df_friends = pd.read_csv(CSV_PATH_FRIENDS)
df_train = pd.read_csv(CSV_PATH_TRAIN)
df_test = pd.read_csv(CSV_PATH_TEST)

print(df_profiles.shape)
print(df_posts.shape)
print(df_friends.shape)
print(df_train.shape)
print(df_test.shape)

(4904, 11)
(43979, 7)
(953179, 2)
(6356754, 3)
(706407, 3)


### Prepare data

In [3]:
friends_net = networkx.Graph()
friends_net.add_nodes_from(df_friends['user1'])
friends_net.add_nodes_from(df_friends['user2'])
friends_net.add_edges_from(df_friends.values)

In [4]:
def profile_closed_cnt(first_user, second_user):
    cnt = 0
    if first_user[0]:
        cnt += 1

    if second_user[0]:
        cnt += 1
        
    return cnt

def posts_cnt(first_user, second_user):
    return [first_user[1], second_user[2]]

def is_same_city(first_user, second_user):
    return first_user[2] == second_user[2]

def is_same_sex(first_user, second_user):
    return first_user[3] == second_user[3]

def followers_cnt(first_user, second_user):
    cnt = [first_user[4], second_user[4]]
    if np.isnan(cnt[0]):
        cnt[0] = friends_cnt(first_user, second_user)[0]

    if np.isnan(cnt[1]):
        cnt[1] = friends_cnt(first_user, second_user)[1]

    return cnt

def friends_cnt(first_user, second_user):
    cnt = [first_user[5], second_user[5]]

    if np.isnan(cnt[0]):
        cnt[0] = 0

    if np.isnan(cnt[1]):
        cnt[1] = 0

    return cnt

def most_friends_sex(first_user, second_user):
    if friends_cnt(first_user, second_user)[0] > friends_cnt(first_user, second_user)[1]:
        return first_user[3]
    else:
        return second_user[3]

def same_school_cnt(first_user, second_user):
    schools_first = first_user[6]
    schools_second = second_user[6]
    
    if isinstance(schools_first, float) or isinstance(schools_second, float): 
        return 0
    
    schools_first_list = ast.literal_eval(schools_first)
    schools_second_list = ast.literal_eval(schools_second)

    return len(set(schools_first_list).intersection(set(schools_second_list)))

def same_university_cnt(first_user, second_user):
    universities_first = first_user[7]
    universities_second = second_user[7]

    if isinstance(universities_first, float) or isinstance(universities_second, float): 
        return []
    
    universities_first_list = ast.literal_eval(universities_first)
    universities_second_list = ast.literal_eval(universities_second)
    
    return set(universities_first_list).intersection(set(universities_second_list))

def was_on_same_faculty(first_user, second_user):
    universities = same_university_cnt(first_user, second_user)

    universities_first = first_user[7]
    universities_second = second_user[7]

    faculties_first = first_user[8]
    faculties_second = second_user[8]

    if isinstance(faculties_first, float) or isinstance(faculties_second, float): 
        return 0

    if isinstance(universities_first, float) or isinstance(universities_second, float): 
            return 0
        
    faculties_first_list = ast.literal_eval(faculties_first)
    faculties_second_list = ast.literal_eval(faculties_second)
    universities_first_list = ast.literal_eval(universities_first)
    universities_second_list = ast.literal_eval(universities_second)
    
    for i in universities:
        if faculties_first_list[universities_first_list.index(i)] == faculties_second_list[universities_second_list.index(i)]:
            if faculties_first_list[universities_first_list.index(i)] != 'None':
                return 1

    return 0

def common_friends_cnt(user1, user2):
    return len(set(friends_net.neighbors(user1)).intersection(set(friends_net.neighbors(user2))))


def approximate_age_difference(first_user, second_user):
    years_first = first_user[9]
    years_second = second_user[9]
    
    if isinstance(years_first, float) or isinstance(years_second, float): 
        return -1
    
    years_first_list = ast.literal_eval(years_first)
    years_second_list = ast.literal_eval(years_second)
        
    years_first_list = list(filter(None, years_first_list))
    years_second_list = list(filter(None, years_second_list))

    if len(years_first_list) == 0 or len(years_second_list) == 0:
        return -1

    first_max = max(years_first_list)
    second_max = max(years_second_list)

    return abs(first_max - second_max)


def column_names():
    names = [
        'is_friends',
        'profile_closed_cnt',
        'biggest_posts_cnt',
        'smallest_posts_cnt',
        'is_same_city',
        'is_same_sex',
        'biggest_followers_cnt',
        'smallest_followers_cnt',
        'biggest_friends_cnt',
        'smallest_friends_cnt',
        'common_friends_cnt',
        'most_friends_sex',
        'same_school_cnt',
        'same_university_cnt',
        'was_on_same_faculty'
        ]
    return names

def take_all_user_info(user: int):
    ser_prof = df_profiles[df_profiles['id'] == user].iloc[0]
    user_info = [
        ser_prof['is_closed'],
        df_posts[df_posts['author_id'] == user].shape[0],
        ser_prof['city'],
        ser_prof['sex'],
        ser_prof['followers_count'],
        df_friends[df_friends['user1'] == user].shape[0] + df_friends[df_friends['user2'] == user].shape[0],
        ser_prof['schools'],
        ser_prof['universities'],
        ser_prof['faculties'],
        ser_prof['schools_years_grad']
    ]
    return user_info

def add_features(df: pd.DataFrame):
    records = [0] * df.shape[0]
    for i in tqdm(range(df.shape[0])):
        user1 = df['user1'].iloc[i]
        user2 = df['user2'].iloc[i]
        is_friends = df['is_friends'].iloc[i]
        user_1_info = take_all_user_info(user1)
        user_2_info = take_all_user_info(user2)
        records[i] = ([
            is_friends,
            profile_closed_cnt(user_1_info, user_2_info),
            max(posts_cnt(user_1_info, user_2_info)),
            min(posts_cnt(user_1_info, user_2_info)),
            is_same_city(user_1_info, user_2_info),
            is_same_sex(user_1_info, user_2_info),
            max(followers_cnt(user_1_info, user_2_info)),
            min(followers_cnt(user_1_info, user_2_info)),
            max(friends_cnt(user_1_info, user_2_info)),
            min(friends_cnt(user_1_info, user_2_info)),
            common_friends_cnt(user1, user2),
            most_friends_sex(user_1_info, user_2_info),
            same_school_cnt(user_1_info, user_2_info),
            len(same_university_cnt(user_1_info, user_2_info)),
            was_on_same_faculty(user_1_info, user_2_info)
        ])
        
    return pd.DataFrame(records, columns=column_names())


In [5]:
df_true = df_train[df_train['is_friends'] == 1]
df_false = df_train[df_train['is_friends'] == 0]
true_len = df_true.shape[0]
print(true_len)
df_false_small = df_false.sample(n = 4*true_len, replace = False)
df_featured = pd.concat([df_true, df_false_small])
df_featured = df_featured.sort_index(ascending=True)
df_featured = add_features(df_featured)

19896


100%|██████████| 59688/59688 [06:43<00:00, 147.83it/s]


In [6]:
corr = df_featured.corr()
corr.style.background_gradient(cmap='coolwarm')

Unnamed: 0,is_friends,profile_closed_cnt,smallest_posts_cnt,is_same_city,is_same_sex,biggest_friends_cnt,smallest_friends_cnt,common_friends_cnt,most_friends_sex,same_school_cnt,same_university_cnt,was_on_same_faculty
is_friends,1.0,0.051948,-0.062067,0.121725,0.071549,0.165047,0.383782,0.571675,0.078258,0.073527,0.220587,0.182486
profile_closed_cnt,0.051948,1.0,-0.007347,-0.003037,0.018894,0.028161,0.048674,0.039343,0.041039,0.007191,0.032788,0.054298
smallest_posts_cnt,-0.062067,-0.007347,1.0,-0.448269,-0.02355,0.047477,-0.028608,-0.045767,-0.041627,-0.008704,-0.05841,-0.040447
is_same_city,0.121725,-0.003037,-0.448269,1.0,-0.007984,0.071081,0.152396,0.121149,0.001831,0.022893,0.100974,0.059165
is_same_sex,0.071549,0.018894,-0.02355,-0.007984,1.0,-0.013981,-0.005585,0.028877,0.344399,0.001171,0.031664,0.031086
biggest_friends_cnt,0.165047,0.028161,0.047477,0.071081,-0.013981,1.0,0.40547,0.277313,0.025513,0.00462,0.046226,0.00299
smallest_friends_cnt,0.383782,0.048674,-0.028608,0.152396,-0.005585,0.40547,1.0,0.529185,0.007261,0.031782,0.131179,0.069614
common_friends_cnt,0.571675,0.039343,-0.045767,0.121149,0.028877,0.277313,0.529185,1.0,0.050632,0.082182,0.163915,0.129012
most_friends_sex,0.078258,0.041039,-0.041627,0.001831,0.344399,0.025513,0.007261,0.050632,1.0,0.006471,0.049493,0.04678
same_school_cnt,0.073527,0.007191,-0.008704,0.022893,0.001171,0.00462,0.031782,0.082182,0.006471,1.0,0.085636,0.067227


### Learn model

In [7]:
def separate_train_test(data):
    return np.split(data, [int(.8*len(data))])

def separate_x_and_y(data):
    x = data.drop('is_friends', axis=1)
    y = data['is_friends']
    return x, y

def separate_train_val_dec(data):
    data1, data2 = np.split(data, [int(((10 - 1) / 10) * len(data))])
    return data1, data2

def test_column_names():
    names = [
        'profile_closed_cnt',
        'biggest_posts_cnt',
        'smallest_posts_cnt',
        'is_same_city',
        'is_same_sex',
        'biggest_followers_cnt',
        'smallest_followers_cnt',
        'biggest_friends_cnt',
        'smallest_friends_cnt',
        'common_friends_cnt',
        'most_friends_sex',
        'same_school_cnt',
        'same_university_cnt',
        'was_on_same_faculty'
        ]
    return names

train_data, test_data = separate_train_test(df_featured.copy())
print("Train data len:", len(train_data))
print("Test data len:", len(test_data))

Train data len: 47750
Test data len: 11938


In [8]:
lib_acc_result = []

train_k_data, val_data = separate_train_val_dec(train_data)
x_train, y_train = separate_x_and_y(train_k_data)
x_val, y_val = separate_x_and_y(val_data)
x_train = pd.DataFrame(preprocessing.normalize(x_train.copy()), columns=test_column_names())
x_val = pd.DataFrame(preprocessing.normalize(x_val.copy()), columns=test_column_names())

clf2 = RandomForestClassifier(n_estimators=400, max_depth=15)
clf3 = RandomForestClassifier(n_estimators=200, max_depth=20)
clf4 = KNeighborsClassifier(n_neighbors=21)

svc = VotingClassifier(estimators=[('rf', clf2), ('rf2', clf3), ('knn', clf4)], voting='soft')

svc_model = svc.fit(x_train, y_train)
svc_predictions = svc.predict(x_val)
lib_acc_result.append([0, accuracy_score(y_val, svc_predictions)])

lib_acc_result.sort(key=lambda x: x[1], reverse=True)
print("Best trained lib model [k, val accuracy]:", lib_acc_result[0])

Best trained lib model [k, val accuracy]: [0, 0.951413612565445]


### Predict

In [9]:
def add_test_features(df: pd.DataFrame):
    records = [0] * df.shape[0]
    for i in tqdm(range(df.shape[0])):
        user1 = df['user1'].iloc[i]
        user2 = df['user2'].iloc[i]
        user_1_info = take_all_user_info(user1)
        user_2_info = take_all_user_info(user2)
        records[i] = ([
            profile_closed_cnt(user_1_info, user_2_info),
            max(posts_cnt(user_1_info, user_2_info)),
            min(posts_cnt(user_1_info, user_2_info)),
            is_same_city(user_1_info, user_2_info),
            is_same_sex(user_1_info, user_2_info),
            max(followers_cnt(user_1_info, user_2_info)),
            min(followers_cnt(user_1_info, user_2_info)),
            max(friends_cnt(user_1_info, user_2_info)),
            min(friends_cnt(user_1_info, user_2_info)),
            common_friends_cnt(user1, user2),
            most_friends_sex(user_1_info, user_2_info),
            same_school_cnt(user_1_info, user_2_info),
            len(same_university_cnt(user_1_info, user_2_info)),
            was_on_same_faculty(user_1_info, user_2_info)
        ])
    
    return pd.DataFrame(preprocessing.normalize(pd.DataFrame(records, columns=test_column_names())), columns=test_column_names())


test_data = add_test_features(df_test)

100%|██████████| 706407/706407 [1:16:34<00:00, 153.74it/s]


In [None]:
train_x, train_y = separate_x_and_y(train_k_data.copy())
train_x = pd.DataFrame(preprocessing.normalize(train_x.copy()), columns=test_column_names())


clf2 = RandomForestClassifier(n_estimators=400, max_depth=15)
clf3 = RandomForestClassifier(n_estimators=200, max_depth=20)
clf4 = KNeighborsClassifier(n_neighbors=11)
svc = VotingClassifier(estimators=[('rf', clf2), ('rf2', clf3), ('knn', clf4)], voting='soft')

svc_model = svc.fit(train_x, train_y)
svc_predictions = svc.predict(test_data)

In [None]:
pd.DataFrame(svc_predictions).to_csv(CSV_PATH_SAVE1)

In [None]:
train_x, train_y = separate_x_and_y(train_k_data.copy())
train_x = pd.DataFrame(preprocessing.normalize(train_x.copy()), columns=test_column_names())

clf2 = RandomForestClassifier(n_estimators=100)
clf3 = RandomForestClassifier(n_estimators=120)
clf4 = RandomForestClassifier(n_estimators=50)

svc = VotingClassifier(estimators=[('rf', clf2), ('rf2', clf3), ('rf3', clf4)], voting='soft')

pd.DataFrame(svc_predictions).to_csv(CSV_PATH_SAVE2)