## Preprocess train data

In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [2]:
df = pd.read_csv('train.csv', delimiter=';').drop_duplicates()
sex_labels = pd.read_csv('train_labels.csv', delimiter=';').drop_duplicates()
df = df.join(sex_labels.set_index('user_id'), on='user_id', how='inner')
del sex_labels

In [3]:
df.shape

(591836, 6)

In [4]:
df.head()

Unnamed: 0,request_ts,user_id,referer,geo_id,user_agent,target
0,1701011363,fb858e8e0a2bec074450eaf94b627fd3,https://9b48ee5/,4799,"{'browser': 'Chrome Mobile', 'browser_version'...",0
1,1700986581,46a5f128fd569c764a92c2eaa788095e,https://9b48ee5/,8257,"{'browser': 'Chrome Mobile', 'browser_version'...",0
2,1701011071,5a74e9ac53ffb21a20cce117c0ad77ba,https://9634fd0/1409e548,3150,"{'browser': 'Yandex Browser', 'browser_version...",0
3,1700992803,af735816ca19115431ae3d89518c8c91,https://9b48ee5/,2740,"{'browser': 'Chrome Mobile', 'browser_version'...",0
4,1701021666,364f0ae0a3f29a685c4fb5bae6033b9a,https://9b48ee5/,4863,"{'browser': 'Yandex Browser', 'browser_version...",0


In [5]:
geo_id = pd.read_csv('geo_info.csv', delimiter=';')

In [6]:
geo_id.head()

Unnamed: 0,geo_id,country_id,region_id,timezone_id
0,6447,c31b4e,470e75,f6155e
1,8730,a0a6e9,,d816ca
2,7769,e878d4,,ec4385
3,7330,c31b4e,23f9c2,f6155e
4,600,c31b4e,6dbc37,e56e80


In [7]:
df = df.join(geo_id.set_index('geo_id'), on='geo_id')

In [8]:
df.shape

(591836, 9)

In [9]:
df.head()

Unnamed: 0,request_ts,user_id,referer,geo_id,user_agent,target,country_id,region_id,timezone_id
0,1701011363,fb858e8e0a2bec074450eaf94b627fd3,https://9b48ee5/,4799,"{'browser': 'Chrome Mobile', 'browser_version'...",0,c31b4e,470e75,f6155e
1,1700986581,46a5f128fd569c764a92c2eaa788095e,https://9b48ee5/,8257,"{'browser': 'Chrome Mobile', 'browser_version'...",0,c31b4e,44520b,e56e80
2,1701011071,5a74e9ac53ffb21a20cce117c0ad77ba,https://9634fd0/1409e548,3150,"{'browser': 'Yandex Browser', 'browser_version...",0,c31b4e,616bb9,af47f1
3,1700992803,af735816ca19115431ae3d89518c8c91,https://9b48ee5/,2740,"{'browser': 'Chrome Mobile', 'browser_version'...",0,c31b4e,3c9dca,e56e80
4,1701021666,364f0ae0a3f29a685c4fb5bae6033b9a,https://9b48ee5/,4863,"{'browser': 'Yandex Browser', 'browser_version...",0,c31b4e,776e76,10b7947


In [10]:
referer = pd.read_csv('referer_vectors.csv', delimiter=';').drop_duplicates()

In [11]:
referer.head()

Unnamed: 0,component0,component1,component2,component3,component4,component5,component6,component7,component8,component9,referer
0,16708,-3741,11395,-1597,-3212,6269,5610,-15351,13779,14102,https://a6899a4/15652e67
1,11731,4045,22213,-1184,-8992,9381,-3496,-3120,-899,16817,https://9b48ee5/
2,10551,2947,12282,-470,16222,4472,-3316,9606,4197,18948,https://7a4c700/161af7e3
3,12816,20498,-10110,7731,-569,12035,3014,6398,11439,-271,https://9653126/159bc361
4,3710,11096,11333,14673,8030,1852,10554,11625,4306,13210,https://72879b4/125c29e6


In [12]:
referer.shape

(198844, 11)

In [13]:
df = df.join(referer.set_index('referer'), on='referer')

In [14]:
df.head()

Unnamed: 0,request_ts,user_id,referer,geo_id,user_agent,target,country_id,region_id,timezone_id,component0,component1,component2,component3,component4,component5,component6,component7,component8,component9
0,1701011363,fb858e8e0a2bec074450eaf94b627fd3,https://9b48ee5/,4799,"{'browser': 'Chrome Mobile', 'browser_version'...",0,c31b4e,470e75,f6155e,11731,4045,22213,-1184,-8992,9381,-3496,-3120,-899,16817
1,1700986581,46a5f128fd569c764a92c2eaa788095e,https://9b48ee5/,8257,"{'browser': 'Chrome Mobile', 'browser_version'...",0,c31b4e,44520b,e56e80,11731,4045,22213,-1184,-8992,9381,-3496,-3120,-899,16817
2,1701011071,5a74e9ac53ffb21a20cce117c0ad77ba,https://9634fd0/1409e548,3150,"{'browser': 'Yandex Browser', 'browser_version...",0,c31b4e,616bb9,af47f1,12498,2451,10304,-6380,11608,3106,-2188,10573,3347,21870
3,1700992803,af735816ca19115431ae3d89518c8c91,https://9b48ee5/,2740,"{'browser': 'Chrome Mobile', 'browser_version'...",0,c31b4e,3c9dca,e56e80,11731,4045,22213,-1184,-8992,9381,-3496,-3120,-899,16817
4,1701021666,364f0ae0a3f29a685c4fb5bae6033b9a,https://9b48ee5/,4863,"{'browser': 'Yandex Browser', 'browser_version...",0,c31b4e,776e76,10b7947,11731,4045,22213,-1184,-8992,9381,-3496,-3120,-899,16817


In [15]:
def extract_user_agent_info(user_agent_str):
    try:
        user_agent_dict = eval(user_agent_str)
        browser = user_agent_dict.get('browser', 'Unknown')
        os = user_agent_dict.get('os', 'Unknown')
        browser_version = user_agent_dict.get('browser_version', 'Unknown')
        os_version = user_agent_dict.get('os_version', 'Unknown')
        return pd.Series([browser, os, browser_version, os_version])
    except:
        return pd.Series(['Unknown', 'Unknown', 'Unknown', 'Unknown'])
df[['browser', 'os', 'browser_version', 'os_version']] = df['user_agent'].apply(extract_user_agent_info)

df = df.drop(columns=['user_agent'])
df.head()

Unnamed: 0,request_ts,user_id,referer,geo_id,target,country_id,region_id,timezone_id,component0,component1,...,component4,component5,component6,component7,component8,component9,browser,os,browser_version,os_version
0,1701011363,fb858e8e0a2bec074450eaf94b627fd3,https://9b48ee5/,4799,0,c31b4e,470e75,f6155e,11731,4045,...,-8992,9381,-3496,-3120,-899,16817,Chrome Mobile,Android,119.0.0,10
1,1700986581,46a5f128fd569c764a92c2eaa788095e,https://9b48ee5/,8257,0,c31b4e,44520b,e56e80,11731,4045,...,-8992,9381,-3496,-3120,-899,16817,Chrome Mobile,Android,111.0.0,10
2,1701011071,5a74e9ac53ffb21a20cce117c0ad77ba,https://9634fd0/1409e548,3150,0,c31b4e,616bb9,af47f1,12498,2451,...,11608,3106,-2188,10573,3347,21870,Yandex Browser,Android,20.12.5,11
3,1700992803,af735816ca19115431ae3d89518c8c91,https://9b48ee5/,2740,0,c31b4e,3c9dca,e56e80,11731,4045,...,-8992,9381,-3496,-3120,-899,16817,Chrome Mobile,Android,119.0.0,10
4,1701021666,364f0ae0a3f29a685c4fb5bae6033b9a,https://9b48ee5/,4863,0,c31b4e,776e76,10b7947,11731,4045,...,-8992,9381,-3496,-3120,-899,16817,Yandex Browser,Android,18.11.1,4.4.4


In [16]:
df['referer'].apply(lambda x: x.split('/')[-2]).value_counts()#.describe()

72879b4    51716
6a81948    46724
8807153    30325
9b08d64    26365
9f1218f    25443
           ...  
65ad3ab        1
8a4cb1e        1
ad93f29        1
85497da        1
be7a853        1
Name: referer, Length: 4998, dtype: int64

In [57]:
df['geo_id'].value_counts().describe()

count     2828.000000
mean       209.277228
std       2140.374942
min          1.000000
25%          2.000000
50%          9.000000
75%         41.000000
max      95172.000000
Name: count, dtype: float64

In [55]:
df['referer'].value_counts().describe()

count    135012.000000
mean          4.383581
std         149.706592
min           1.000000
25%           1.000000
50%           1.000000
75%           1.000000
max       29633.000000
Name: count, dtype: float64

In [17]:
# from sklearn.preprocessing import LabelEncoder
# browser_version = LabelEncoder()
# df['browser_version'] = browser_version.fit_transform(df['browser_version'])
# browser_version = LabelEncoder()
# df['os'] = browser_version.fit_transform(df['os'])
# browser_version = LabelEncoder()
# df['os_version'] = browser_version.fit_transform(df['os_version'])

In [24]:
df.shape

(591836, 22)

In [44]:
# from category_encoders.binary import BinaryEncoder
# bn = BinaryEncoder()
# bn.fit_transform(data.values)

In [49]:
browser_dummies = pd.get_dummies(df['browser'], dtype=float, prefix='browser_')
browser_version_dummies = pd.get_dummies(df['browser_version'], dtype=float, prefix='browser_version')
os_dummies = pd.get_dummies(df['os'], dtype=float, prefix='os_')
os_version_dummies = pd.get_dummies(df['os_version'], dtype=float, prefix='os_version')

In [70]:
country_dummies = pd.get_dummies(df['country_id'], dtype=float, prefix='country_')
region_dummies = pd.get_dummies(df['region_id'], dtype=float, prefix='region')
timezone_dummies = pd.get_dummies(df['timezone_id'], dtype=float, prefix='timezone_')

In [71]:
pca = PCA(n_components=10)
country_dummies = pd.DataFrame(data=pca.fit_transform(country_dummies), index=df.index, columns=[f'country_{i}' for i in range(10)])
pca = PCA(n_components=10)
region_dummies = pd.DataFrame(data=pca.fit_transform(region_dummies), index=df.index, columns=[f'region_{i}' for i in range(10)])
pca = PCA(n_components=10)
timezone_dummies = pd.DataFrame(data=pca.fit_transform(timezone_dummies), index=df.index, columns=[f'timezone_{i}' for i in range(10)])
df = df.join(country_dummies).join(region_dummies).join(timezone_dummies)

In [50]:
from sklearn.decomposition import PCA
pca = PCA(n_components=10)
browser_dummies = pd.DataFrame(data=pca.fit_transform(browser_dummies), index=df.index, columns=[f'browser_{i}' for i in range(10)])
pca = PCA(n_components=10)
browser_version_dummies = pd.DataFrame(data=pca.fit_transform(browser_version_dummies), index=df.index, columns=[f'browser_version_{i}' for i in range(10)])
pca = PCA(n_components=10)
os_dummies = pd.DataFrame(data=pca.fit_transform(os_dummies), index=df.index, columns=[f'os_{i}' for i in range(10)])
pca = PCA(n_components=10)
os_version_dummies = pd.DataFrame(data=pca.fit_transform(os_version_dummies), index=df.index, columns=[f'os_version_{i}' for i in range(10)])
df = df.join(browser_dummies).join(browser_version_dummies).join(os_dummies).join(os_version_dummies)

In [22]:
%matplotlib inline
df[df['browser_version'].value_counts() < 10]
# df = df.drop(df[df['browser_version'] < ].index)

  df[df['browser_version'].value_counts() < 10]


IndexingError: Unalignable boolean Series provided as indexer (index of the boolean Series and of the indexed object do not match).

In [72]:
df.columns

Index(['request_ts', 'user_id', 'referer', 'geo_id', 'target', 'country_id',
       'region_id', 'timezone_id', 'component0', 'component1', 'component2',
       'component3', 'component4', 'component5', 'component6', 'component7',
       'component8', 'component9', 'browser', 'os', 'browser_version',
       'os_version', 'browser_0', 'browser_1', 'browser_2', 'browser_3',
       'browser_4', 'browser_5', 'browser_6', 'browser_7', 'browser_8',
       'browser_9', 'browser_version_0', 'browser_version_1',
       'browser_version_2', 'browser_version_3', 'browser_version_4',
       'browser_version_5', 'browser_version_6', 'browser_version_7',
       'browser_version_8', 'browser_version_9', 'os_0', 'os_1', 'os_2',
       'os_3', 'os_4', 'os_5', 'os_6', 'os_7', 'os_8', 'os_9', 'os_version_0',
       'os_version_1', 'os_version_2', 'os_version_3', 'os_version_4',
       'os_version_5', 'os_version_6', 'os_version_7', 'os_version_8',
       'os_version_9', 'country_0', 'country_1', 'country

In [16]:
df = df.dropna(subset='target')

In [17]:
from sklearn.model_selection import train_test_split
X = df.drop(columns=['target', 'user_id', 'referer', 'request_ts', 'country_id', 'region_id', 'timezone_id', 'os', 'os_version', 'browser', 'browser_version']).values
y = df['target']
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 42)

In [18]:
df.columns

Index(['request_ts', 'user_id', 'referer', 'geo_id', 'target', 'country_id',
       'region_id', 'timezone_id', 'component0', 'component1', 'component2',
       'component3', 'component4', 'component5', 'component6', 'component7',
       'component8', 'component9', 'browser', 'os', 'browser_version',
       'os_version'],
      dtype='object')

In [30]:
df2 = df.drop(columns=['user_id', 'referer', 'request_ts', 'country_id', 'region_id', 'timezone_id']).corr()

ValueError: could not convert string to float: 'Chrome Mobile'

In [None]:
import matplotlib.pyplot as plt

plt.matshow(df2)
plt.show()

In [19]:
scale = StandardScaler()
scaled_x_train = scale.fit_transform(x_train)
scaled_x_test = scale.fit_transform(x_test)

In [20]:
from sklearn.naive_bayes import GaussianNB
nb = GaussianNB()
result_bayes = nb.fit(scaled_x_train, y_train)
nb.score(scaled_x_test,y_test)

0.6511423100160774

In [21]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
result_logreg = logreg.fit(scaled_x_train, y_train)
logreg.score(scaled_x_test,y_test)

0.6725958239890224

In [None]:
from sklearn import svm
metodsvm = svm.SVC()
result_svm = metodsvm.fit(x_train, y_train)
metodsvm.score(x_test, y_test)

In [22]:
#адаптивный бустинг

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier

modelClf = AdaBoostClassifier(estimator=DecisionTreeClassifier(max_depth=2), n_estimators=100, random_state=42)
modelclf_fit = modelClf.fit(scaled_x_train, y_train)
modelClf.score(scaled_x_test, y_test)



0.7090719179134282

In [23]:
from sklearn.ensemble import GradientBoostingClassifier

modelClf = GradientBoostingClassifier(max_depth=2, n_estimators=150,random_state=12, learning_rate=1)


modelClf.fit(scaled_x_train, y_train)
modelClf.score(scaled_x_test, y_test)

0.7433924200997409

In [89]:
len(scaled_x_train[0])

81

In [24]:
from keras import models
from keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [25]:
model = models.Sequential()
model.add(layers.Dense(128,activation='relu',input_shape=(11,)))
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dense(1, activation = 'sigmoid'))

In [26]:
from keras import optimizers
model.compile(optimizer='rmsprop',loss='binary_crossentropy',metrics=['accuracy'])

In [27]:
history=model.fit(scaled_x_train, y_train, epochs=40,batch_size=512,validation_data=(scaled_x_test,y_test))

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


In [59]:
history_dict = history.history
history_dict.keys()
dict_keys(['loss', 'accuracy', 'val_loss', 'val_accuracy'])

NameError: name 'dict_keys' is not defined

## Preprocessing for tests

In [28]:
df_test = pd.read_csv('test.csv', delimiter=';').drop_duplicates()
df_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 149737 entries, 0 to 149999
Data columns (total 5 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   request_ts  149737 non-null  int64 
 1   user_id     149737 non-null  object
 2   referer     149737 non-null  object
 3   geo_id      149737 non-null  int64 
 4   user_agent  149737 non-null  object
dtypes: int64(2), object(3)
memory usage: 6.9+ MB


In [29]:
geo_id = pd.read_csv('geo_info.csv', delimiter=';')
df_test = df_test.join(geo_id.set_index('geo_id'), on='geo_id')

In [30]:
referer = pd.read_csv('referer_vectors.csv', delimiter=';').drop_duplicates()
df_test = df_test.join(referer.set_index('referer'), on='referer')

In [33]:
def extract_user_agent_info(user_agent_str):
    try:
        user_agent_dict = eval(user_agent_str)
        browser = user_agent_dict.get('browser', 'Unknown')
        os = user_agent_dict.get('os', 'Unknown')
        browser_version = user_agent_dict.get('browser_version', 'Unknown')
        os_version = user_agent_dict.get('os_version', 'Unknown')
        return pd.Series([browser, os, browser_version, os_version])
    except:
        return pd.Series(['Unknown', 'Unknown', 'Unknown', 'Unknown'])
df_test[['browser', 'os', 'browser_version', 'os_version']] = df_test['user_agent'].apply(extract_user_agent_info)

df_test = df_test.drop(columns=['user_agent'])
df_test.head()

Unnamed: 0,request_ts,user_id,referer,geo_id,country_id,region_id,timezone_id,component0,component1,component2,...,component4,component5,component6,component7,component8,component9,browser,os,browser_version,os_version
0,1700993094,c2802dadd33d8ae09bb366bdd41212ea,https://9b48ee5/,8816,c31b4e,36e3f3,f6155e,11731,4045,22213,...,-8992,9381,-3496,-3120,-899,16817,Chrome Mobile,Android,96.0.4664,12
1,1701005579,e5b1988db74527ec092f28b0bbfdaac9,https://9b48ee5/,3663,c31b4e,8ccc01,e56e80,11731,4045,22213,...,-8992,9381,-3496,-3120,-899,16817,Chrome,Android,116.0.5845,10
2,1700969752,6ef1eedbdb72554e53e69782066065c5,https://72879b4/12411b9e,2336,c31b4e,1fbfa5,e56e80,-7307,11682,9741,...,13577,1200,10169,16461,-3932,3340,Chrome,Android,114.0.0,10
3,1700991608,7e057293ecae62985a327b7af51858ea,https://9b48ee5/,9652,c31b4e,f66ff,f6155e,11731,4045,22213,...,-8992,9381,-3496,-3120,-899,16817,Chrome Mobile,Android,91.0.4472,11
4,1701019815,a27bd7ce8828497823fa8d5d05e7bbf7,https://9b48ee5/,3871,c31b4e,245864,e56e80,11731,4045,22213,...,-8992,9381,-3496,-3120,-899,16817,Chrome Mobile,Android,119.0.0,10


In [100]:
browser_dummies = pd.get_dummies(df_test['browser'], dtype=float, prefix='browser_')
browser_version_dummies = pd.get_dummies(df_test['browser_version'], dtype=float, prefix='browser_version')
os_dummies = pd.get_dummies(df_test['os'], dtype=float, prefix='os_')
os_version_dummies = pd.get_dummies(df_test['os_version'], dtype=float, prefix='os_version')

In [101]:
country_dummies = pd.get_dummies(df_test['country_id'], dtype=float, prefix='country_')
region_dummies = pd.get_dummies(df_test['region_id'], dtype=float, prefix='region')
timezone_dummies = pd.get_dummies(df_test['timezone_id'], dtype=float, prefix='timezone_')

In [102]:
pca = PCA(n_components=10)
country_dummies = pd.DataFrame(data=pca.fit_transform(country_dummies), index=df_test.index, columns=[f'country_{i}' for i in range(10)])
pca = PCA(n_components=10)
region_dummies = pd.DataFrame(data=pca.fit_transform(region_dummies), index=df_test.index, columns=[f'region_{i}' for i in range(10)])
pca = PCA(n_components=10)
timezone_dummies = pd.DataFrame(data=pca.fit_transform(timezone_dummies), index=df_test.index, columns=[f'timezone_{i}' for i in range(10)])
df_test = df_test.join(country_dummies).join(region_dummies).join(timezone_dummies)

In [103]:
from sklearn.decomposition import PCA
pca = PCA(n_components=10)
browser_dummies = pd.DataFrame(data=pca.fit_transform(browser_dummies), index=df_test.index, columns=[f'browser_{i}' for i in range(10)])
pca = PCA(n_components=10)
browser_version_dummies = pd.DataFrame(data=pca.fit_transform(browser_version_dummies), index=df_test.index, columns=[f'browser_version_{i}' for i in range(10)])
pca = PCA(n_components=10)
os_dummies = pd.DataFrame(data=pca.fit_transform(os_dummies), index=df_test.index, columns=[f'os_{i}' for i in range(10)])
pca = PCA(n_components=10)
os_version_dummies = pd.DataFrame(data=pca.fit_transform(os_version_dummies), index=df_test.index, columns=[f'os_version_{i}' for i in range(10)])
df_test = df_test.join(browser_dummies).join(browser_version_dummies).join(os_dummies).join(os_version_dummies)

In [34]:
from sklearn.model_selection import train_test_split
X = df_test.drop(columns=['user_id', 'referer', 'request_ts', 'country_id', 'region_id', 'timezone_id', 'os', 'os_version', 'browser', 'browser_version']).values

In [35]:
scale = StandardScaler()
scaled_X = scale.fit_transform(X)

In [111]:
preds.flatten() 

array([0., 0., 0., ..., 1., 0., 1.], dtype=float32)

In [40]:
preds = modelClf.predict(scaled_X)
result = pd.DataFrame({'user_id': df_test['user_id'], 'target': preds.flatten().astype('int64')})

In [41]:
result.head()

Unnamed: 0,user_id,target
0,c2802dadd33d8ae09bb366bdd41212ea,0
1,e5b1988db74527ec092f28b0bbfdaac9,0
2,6ef1eedbdb72554e53e69782066065c5,0
3,7e057293ecae62985a327b7af51858ea,0
4,a27bd7ce8828497823fa8d5d05e7bbf7,0


In [42]:
result.to_csv("finale.csv", index=False, sep=';')

In [43]:
pd.read_csv('finale.csv', delimiter=';').head()

Unnamed: 0,user_id,target
0,c2802dadd33d8ae09bb366bdd41212ea,0
1,e5b1988db74527ec092f28b0bbfdaac9,0
2,6ef1eedbdb72554e53e69782066065c5,0
3,7e057293ecae62985a327b7af51858ea,0
4,a27bd7ce8828497823fa8d5d05e7bbf7,0
