In [1]:
import os
import numpy as np
import pandas as pd

root = '/kaggle/input/garanti-bbva-data-camp/'
target = 'moved_after_2019'
idx = 'user_id'

In [2]:
import warnings
warnings.simplefilter(action="ignore")

pd.set_option('display.max_columns', None)
pd.set_option('display.width', 170)
pd.set_option('display.max_rows', 100)
pd.set_option('display.float_format', lambda x: '%.3f' % x)

In [3]:
df_train = pd.read_csv(os.path.join(root, 'train_users.csv'))
df_test  = pd.read_csv(os.path.join(root, 'test_users.csv'))
df_subm  = pd.read_csv(os.path.join(root, 'submission.csv'))
df_lang  = pd.read_csv(os.path.join(root, 'languages.csv'))
df_edu   = pd.read_csv(os.path.join(root, 'education.csv'))
df_skills = pd.read_csv(os.path.join(root, 'skills.csv'))
df_exp   = pd.read_csv(os.path.join(root, 'work_experiences.csv'))

In [4]:
df_train = df_train.set_index(idx)
df_test = df_test.set_index(idx)
df_subm = df_subm.set_index(idx)

df_exp = df_exp[df_exp['start_year_month'] < 201901] # see https://www.kaggle.com/competitions/garanti-bbva-data-camp/discussion/383774

## DataPreprocessing - EDU

In [5]:
df_edu.head()

Unnamed: 0,user_id,school_name,degree,fields_of_study,start_year_month,end_year_month
0,0,Anadolu Üniversitesi,,,,
1,0,Adıyaman Üniversitesi,,,,
2,0,Fırat Üniversitesi,,,,
3,1,Fırat Üniversitesi,Yüksek Lisans,Yazılım Mühendisliği,,
4,1,Fırat Üniversitesi,Lisans,Yazılım Mühendisliği,,


In [6]:
df_edu = df_edu[df_edu['school_name'].notnull() & df_edu['degree'].notnull()]

In [7]:
df_edu.loc[df_edu['degree'].str.contains("Associate|Ön|ön lisans|önlisans", na=False), 'degree'] = "ÖNLİSANS"
df_edu.loc[df_edu['degree'].str.contains("Doctor|Doktor|Ph", na=False), 'degree'] = "DOKTORA"
df_edu.loc[df_edu['degree'].str.contains("Yüksek|Master|MSc|MS|M.Sc.|MBA|Msc|M.Sc|M.S.|M.S", na=False), 'degree'] = "YÜKSEK_LİSANS"
df_edu.loc[df_edu['degree'].str.contains("BS|Bs|Bachelor|BSc|BE|B.E.|B.Sc.|B.S.|B.S|Engineer|BA|BBA|BEng|B.B.A.|B.A.Sc.|Undergraduate|Licentiate|Licence|License|Lisans|lisans|Bacheleer|bachelor|Bsc|Mezun|Graduate", na=False), 'degree'] = "LİSANS"
df_edu.loc[df_edu['degree'].str.contains("Öğrenci|Student", na=False), 'degree'] = "Öğrenci"

df_edu.loc[~df_edu["degree"].isin(["ÖNLİSANS","DOKTORA","YÜKSEK_LİSANS","LİSANS","Öğrenci"]), "degree"] = "Diğer"

In [8]:
df_edu["degree"].value_counts().head(50)

LİSANS           56289
YÜKSEK_LİSANS    21546
Diğer            20550
ÖNLİSANS          5316
DOKTORA           2572
Öğrenci            460
Name: degree, dtype: int64

In [9]:
df_edu['school_name'].value_counts().head(20)

Anadolu Üniversitesi                4620
İstanbul Üniversitesi               3657
Sakarya Üniversitesi                3433
İstanbul Teknik Üniversitesi        3210
Istanbul Technical University       3203
Yıldız Teknik Üniversitesi          3081
Kocaeli Üniversitesi                3042
Marmara Üniversitesi                2835
Hacettepe Üniversitesi              2558
Gazi Üniversitesi                   2364
Ege Üniversitesi                    2291
Bahçeşehir Üniversitesi             2060
Dokuz Eylül Üniversitesi            1846
Istanbul University                 1698
Yildiz Technical University         1697
Selçuk Üniversitesi                 1426
Beykent Üniversitesi                1419
Ankara Üniversitesi                 1419
Süleyman Demirel Üniversitesi       1348
Eskişehir Osmangazi Üniversitesi    1304
Name: school_name, dtype: int64

In [10]:
df_edu.loc[df_edu['school_name'].str.contains("Oxford|Kaliforniya|Harvard|Stanford|Cambridge|Massachusetts|Princeton|Kaliforniya| Berkeley|Yale|Chicago|Kolombiya|Imperial|Johns Hopkins|Pensilvanya|ETH Zürih|Pekin|Tsinghua|Toronto|Londra", na=False), 'school_name'] = "TheFirstLevel(World)"
df_edu.loc[df_edu['school_name'].str.contains("Çankaya|Koç|Sabancı|ODTÜ|Bahçeşehir|Hacettepe|İstanbul Teknik|Istanbul Technical University|Bilkent|Boğaziçi|Düzce|Fırat|İstanbul Medeniyet|Özyeğin|Cankaya|Koc|Sabanci|Odtu|Bahcesehir|Istanbul Teknik|Bogazici|Düzce|Firat|Istanbul Medeniyet|Ozyegin", na=False), 'school_name'] = "TheFirstLevel(TR)"
df_edu.loc[df_edu['school_name'].str.contains("İstanbul Üniversitesi|Istanbul University|Ankara|Ege|İhsan Doğramacı|Bilkent|Gebze|Gazi|Yıldız|Yildiz Technical University|Sabancı|İzmir Yüksek Teknoloji|Atatürk|Bezm-i Alem|Erciyes|Marmara|Dokuz Eylül|Selçuk|Çukurova|Karadeniz Teknik|Eskişehir Osmangazi|Akdeniz|Abdullah Gül|Bursa Uludağ|Ondokuz Mayıs|İnönü|Anadolu", na=False), 'school_name'] = "TheSecondLevel(TR)"
df_edu.loc[df_edu['school_name'].str.contains("Süleyman Demirel|Gaziantep|Sakarya|Çankaya|Kocaeli|Van Yüzüncü|İzmir Katip Çelebi|Yıldırım Beyazıt|Başkent|Atılım|Dicle|Manisa Celâl Bayar|Pamukkale|Tobb Ekonomi Ve Teknoloji|Acıbadem Mehmet Ali Aydınlar|Mersin|Yeditepe", na=False), 'school_name'] = "TheLowLevel(TR)"

In [11]:
df_edu.loc[df_edu['school_name'].str.contains("Orta Doğu Teknik Üniversitesi|Middle East Technical University", na=False), 'school_name'] = "TheFirstLevel(TR)"
df_edu.loc[df_edu['school_name'].str.contains("Karadeniz Technical University|Dokuz Eylul University|Uludağ Üniversitesi|Uludag University|Ataturk|Istanbul Üniversitesi|İstanbul University|Eskisehir Osmangazi University", na=False), 'school_name'] = "TheSecondLevel(TR)"

In [12]:
df_edu.loc[~df_edu["school_name"].isin(["TheFirstLevel(World)","TheFirstLevel(TR)","TheSecondLevel(TR)","TheLowLevel(TR)"]), "school_name"] = "Diğer"

In [13]:
df_edu['school_name'].value_counts().head(20) # .groupby('school_name')['start_year_month'].max() ]

TheSecondLevel(TR)      46015
TheFirstLevel(TR)       28523
Diğer                   24135
TheLowLevel(TR)          7852
TheFirstLevel(World)      208
Name: school_name, dtype: int64

In [14]:
df_edu['fields_of_study'].value_counts().head(40)

Bilgisayar Mühendisliği                                   12473
Computer Engineering                                      11404
Elektrik ve Elektronik Mühendisliği                        3091
Electrical and Electronics Engineering                     2984
Computer Science                                           2771
Bilgisayar Programlama                                     1322
Mechanical Engineering                                     1143
Computer Programming                                       1107
Yönetim Bilişim Sistemleri                                 1107
Industrial Engineering                                     1066
İşletme                                                     914
İşletme ve Yönetim, Genel                                   849
Business Administration and Management, General             846
Software Engineering                                        781
Makine Mühendisliği                                         751
Computer Software Engineering           

In [15]:
df_edu.loc[df_edu['fields_of_study'].str.contains("Computer Engineer", na=False), 'fields_of_study'] = "Bilgisayar Mühendisliği"
df_edu.loc[df_edu['fields_of_study'].str.contains("Electronic Engineering|Electrical and Electronic Engineering|Electrical and Electronic Engineering|Electronics Engineering|Communication Engineering|Elektrik Mühendisliği|Telecommunications Engineering|Telekomünikasyon Mühendisliği|Elektrik, Elektronik ve İletişim Mühendisliği|Elektronik Mühendisliği|Electrical Engineering|Electrical, Electronics and Communications Engineering|Elektronik ve Haberleşme Mühendisliği|Electronics and Communication Engineering|Electronics and Communications Engineering", na=False), 'fields_of_study'] = "Elektrik ve Elektronik Mühendisliği"
df_edu.loc[df_edu['fields_of_study'].str.contains("Mechanical Engineering", na=False), 'fields_of_study'] = "Makine Mühendisliği"
df_edu.loc[df_edu['fields_of_study'].str.contains("İşletme ve Yönetim, Genel|Business Administration|Business Administration|Business|MBA", na=False), 'fields_of_study'] = "İşletme"
df_edu.loc[df_edu['fields_of_study'].str.contains("Industrial Engineering", na=False), 'fields_of_study'] = "Endüstri Mühendisliği"

df_edu.loc[df_edu['fields_of_study'].str.contains("Mathematics", na=False), 'fields_of_study'] = "Matematik"
df_edu.loc[df_edu['fields_of_study'].str.contains("Bilgisayar Teknolojileri ve Programlama|Management Information Systems|Information Technolog|Yönetim Bilgi Sistemleri|Information Systems|Bilişim Teknolojileri|Bilişim Sistemleri|Informatics|Bilgi Teknolojisi|Management Information System|Computer Science|Computer Programming|Bilgisayar Program|Bilgisayar Teknolojisi|Matematik ve Bilgisayar Bilimleri|Computer Technology and Programming", na=False), 'fields_of_study'] = "Yönetim Bilişim Sistemleri"
df_edu.loc[df_edu['fields_of_study'].str.contains("Software Engineer|Bilgisayar Yazılımı Mühendisliği", na=False), 'fields_of_study'] = "Yazılım Mühendisliği"

df_edu.loc[df_edu['fields_of_study'].str.contains("Economics|Ekonomi", na=False), 'fields_of_study'] = "İktisat"
df_edu.loc[df_edu['fields_of_study'].str.contains("Chemical Engineering", na=False), 'fields_of_study'] = "Kimya Mühendisliği"
df_edu.loc[df_edu['fields_of_study'].str.contains("Mekatronik, Robotik ve Otomasyon Mühendisliği|Mechatronics Engineering|Control and Automation Engineering|Automation Engineering|Otomasyon Mühendisliği", na=False), 'fields_of_study'] = "Mekatronik Mühendisliği"
df_edu.loc[df_edu['fields_of_study'].str.contains("Civil Engineering", na=False), 'fields_of_study'] = "İnşaat Mühendisliği"
df_edu.loc[df_edu['fields_of_study'].str.contains("Physics|Physical Sciences", na=False), 'fields_of_study'] = "Fizik"

df_edu.loc[df_edu['fields_of_study'].str.contains("Chemistry|Kimya", na=False), 'fields_of_study'] = "Kimya"
df_edu.loc[df_edu['fields_of_study'].str.contains("Mathematical Engineering", na=False), 'fields_of_study'] = "Matematik Mühendisliği"
df_edu.loc[df_edu['fields_of_study'].str.contains("Statistics", na=False), 'fields_of_study'] = "İstatistik"
df_edu.loc[df_edu['fields_of_study'].str.contains("Food Engineering", na=False), 'fields_of_study'] = "Gıda Mühendisliği"

df_edu.loc[df_edu['fields_of_study'].str.contains("Engineering Management|Engineering/Industrial Management|Management", na=False), 'fields_of_study'] = "Mühendislik Yönetimi"
df_edu.loc[df_edu['fields_of_study'].str.contains("International Relations|International Relations and Affairs|Uluslararası İlişkiler", na=False), 'fields_of_study'] = "Uluslararası İlişkiler"
df_edu.loc[df_edu['fields_of_study'].str.contains("Law", na=False), 'fields_of_study'] = "Avukat"
df_edu.loc[df_edu['fields_of_study'].str.contains("Bilgisayar ve Öğretim Teknolojileri Öğretmenliği", na=False), 'fields_of_study'] = "Bilgisayar Öğretmenliği"
df_edu.loc[df_edu['fields_of_study'].str.contains("Artificial Intelligence", na=False), 'fields_of_study'] = "Yapay Zeka"
df_edu.loc[df_edu['fields_of_study'].str.contains("Metallurgical and Materials Engineering", na=False), 'fields_of_study'] = "Metalurji ve Malzeme Mühendisliği"

In [16]:
df_edu.loc[~df_edu["fields_of_study"].isin(["Bilgisayar Mühendisliği","Elektrik ve Elektronik Mühendisliği","Makine Mühendisliği","İşletme","Endüstri Mühendisliği","Yönetim Bilişim Sistemleri","Yazılım Mühendisliği","İktisat","Kimya Mühendisliği","Mekatronik Mühendisliği","İnşaat Mühendisliği","Fizik","Kimya","Matematik Mühendisliği","İstatistik","Gıda Mühendisliği","Mühendislik Yönetimi","Uluslararası İlişkiler","Avukat","Bilgisayar Öğretmenliği","Yapay Zeka","Metalurji ve Malzeme Mühendisliği"]), "fields_of_study"] = "Diğer"

In [17]:
df_edu['fields_of_study'].value_counts().head(40)

Diğer                                  35819
Bilgisayar Mühendisliği                24754
Yönetim Bilişim Sistemleri             14479
Elektrik ve Elektronik Mühendisliği    10769
İşletme                                 4754
Yazılım Mühendisliği                    2712
Makine Mühendisliği                     2012
Endüstri Mühendisliği                   1910
Mühendislik Yönetimi                    1762
Mekatronik Mühendisliği                 1360
Kimya                                   1302
Fizik                                   1046
İktisat                                  960
Matematik Mühendisliği                   662
İnşaat Mühendisliği                      492
Uluslararası İlişkiler                   471
İstatistik                               400
Bilgisayar Öğretmenliği                  310
Gıda Mühendisliği                        271
Metalurji ve Malzeme Mühendisliği        177
Avukat                                   159
Yapay Zeka                               152
Name: fiel

In [18]:
df_edu.head()

Unnamed: 0,user_id,school_name,degree,fields_of_study,start_year_month,end_year_month
3,1,TheFirstLevel(TR),YÜKSEK_LİSANS,Yazılım Mühendisliği,,
4,1,TheFirstLevel(TR),LİSANS,Yazılım Mühendisliği,,
5,2,TheFirstLevel(TR),Diğer,Bilgisayar Mühendisliği,,
6,2,TheSecondLevel(TR),Diğer,Diğer,,
7,3,TheFirstLevel(TR),LİSANS,Yazılım Mühendisliği,,


In [19]:
df_edu.drop(columns=["start_year_month","end_year_month"], inplace = True)

In [20]:
df_edu.head()

Unnamed: 0,user_id,school_name,degree,fields_of_study
3,1,TheFirstLevel(TR),YÜKSEK_LİSANS,Yazılım Mühendisliği
4,1,TheFirstLevel(TR),LİSANS,Yazılım Mühendisliği
5,2,TheFirstLevel(TR),Diğer,Bilgisayar Mühendisliği
6,2,TheSecondLevel(TR),Diğer,Diğer
7,3,TheFirstLevel(TR),LİSANS,Yazılım Mühendisliği


In [21]:
df_edu = df_edu.drop_duplicates(['user_id', 'degree','fields_of_study'])

In [22]:
df_edu.head()

Unnamed: 0,user_id,school_name,degree,fields_of_study
3,1,TheFirstLevel(TR),YÜKSEK_LİSANS,Yazılım Mühendisliği
4,1,TheFirstLevel(TR),LİSANS,Yazılım Mühendisliği
5,2,TheFirstLevel(TR),Diğer,Bilgisayar Mühendisliği
6,2,TheSecondLevel(TR),Diğer,Diğer
7,3,TheFirstLevel(TR),LİSANS,Yazılım Mühendisliği


In [23]:
df_edu["Degree_School"] = df_edu['school_name'] + "_" + df_edu['degree']

In [24]:
df_edu.head()

Unnamed: 0,user_id,school_name,degree,fields_of_study,Degree_School
3,1,TheFirstLevel(TR),YÜKSEK_LİSANS,Yazılım Mühendisliği,TheFirstLevel(TR)_YÜKSEK_LİSANS
4,1,TheFirstLevel(TR),LİSANS,Yazılım Mühendisliği,TheFirstLevel(TR)_LİSANS
5,2,TheFirstLevel(TR),Diğer,Bilgisayar Mühendisliği,TheFirstLevel(TR)_Diğer
6,2,TheSecondLevel(TR),Diğer,Diğer,TheSecondLevel(TR)_Diğer
7,3,TheFirstLevel(TR),LİSANS,Yazılım Mühendisliği,TheFirstLevel(TR)_LİSANS


In [25]:
df_edu.drop(columns=["school_name","degree"], inplace = True)

In [26]:
df_edu.head()

Unnamed: 0,user_id,fields_of_study,Degree_School
3,1,Yazılım Mühendisliği,TheFirstLevel(TR)_YÜKSEK_LİSANS
4,1,Yazılım Mühendisliği,TheFirstLevel(TR)_LİSANS
5,2,Bilgisayar Mühendisliği,TheFirstLevel(TR)_Diğer
6,2,Diğer,TheSecondLevel(TR)_Diğer
7,3,Yazılım Mühendisliği,TheFirstLevel(TR)_LİSANS


In [27]:
df_edu.columns

Index(['user_id', 'fields_of_study', 'Degree_School'], dtype='object')

In [28]:
df_edu1 = df_edu.drop_duplicates(['user_id', 'Degree_School'])

In [29]:
df_edu1 = df_edu1.pivot_table(index='user_id', columns='Degree_School', values='Degree_School',aggfunc='count')

In [30]:
df_edu1.head()

Degree_School,Diğer_DOKTORA,Diğer_Diğer,Diğer_LİSANS,Diğer_YÜKSEK_LİSANS,Diğer_ÖNLİSANS,Diğer_Öğrenci,TheFirstLevel(TR)_DOKTORA,TheFirstLevel(TR)_Diğer,TheFirstLevel(TR)_LİSANS,TheFirstLevel(TR)_YÜKSEK_LİSANS,TheFirstLevel(TR)_ÖNLİSANS,TheFirstLevel(TR)_Öğrenci,TheFirstLevel(World)_DOKTORA,TheFirstLevel(World)_Diğer,TheFirstLevel(World)_LİSANS,TheFirstLevel(World)_YÜKSEK_LİSANS,TheFirstLevel(World)_Öğrenci,TheLowLevel(TR)_DOKTORA,TheLowLevel(TR)_Diğer,TheLowLevel(TR)_LİSANS,TheLowLevel(TR)_YÜKSEK_LİSANS,TheLowLevel(TR)_ÖNLİSANS,TheLowLevel(TR)_Öğrenci,TheSecondLevel(TR)_DOKTORA,TheSecondLevel(TR)_Diğer,TheSecondLevel(TR)_LİSANS,TheSecondLevel(TR)_YÜKSEK_LİSANS,TheSecondLevel(TR)_ÖNLİSANS,TheSecondLevel(TR)_Öğrenci
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1
1,,,,,,,,,1.0,1.0,,,,,,,,,,,,,,,,,,,
2,,,,,,,,1.0,,,,,,,,,,,,,,,,,1.0,,,,
3,,,,,,,,,1.0,,,,,,,,,,,,,,,,,,,,
4,,,,,,,,,1.0,,,,,,,,,,,,,,,,,,,,
5,,,1.0,,,,,,,1.0,,,,,,,,,,,,,,,1.0,,,,


In [31]:
df_edu2 = df_edu.drop_duplicates(['user_id', 'fields_of_study'])

In [32]:
df_edu2 = df_edu2.pivot_table(index='user_id', columns='fields_of_study', values='fields_of_study',aggfunc='count')

In [33]:
df_edu2.head()

fields_of_study,Avukat,Bilgisayar Mühendisliği,Bilgisayar Öğretmenliği,Diğer,Elektrik ve Elektronik Mühendisliği,Endüstri Mühendisliği,Fizik,Gıda Mühendisliği,Kimya,Makine Mühendisliği,Matematik Mühendisliği,Mekatronik Mühendisliği,Metalurji ve Malzeme Mühendisliği,Mühendislik Yönetimi,Uluslararası İlişkiler,Yapay Zeka,Yazılım Mühendisliği,Yönetim Bilişim Sistemleri,İktisat,İnşaat Mühendisliği,İstatistik,İşletme
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,,,,,,,,,,,,,,,,,1.0,,,,,
2,,1.0,,1.0,,,,,,,,,,,,,,,,,,
3,,,,,,,,,,,,,,,,,1.0,,,,,
4,,,,,,,,,,,,,,,,,1.0,,,,,
5,,,,1.0,,,,,,,,,,,,,1.0,,,,,


In [34]:
def check_df(dataframe, head=5):
    print("##################### Shape #####################")
    print(dataframe.shape)
    print("##################### Types #####################")
    print(dataframe.dtypes)
    print("##################### Head #####################")
    print(dataframe.head(head))
    print("##################### Tail #####################")
    print(dataframe.tail(head))
    print("##################### NA #####################")
    print(dataframe.isnull().sum())
    print("##################### Quantiles #####################")
    print(dataframe.quantile([0,0.05,0.25 ,0.50,0.75 ,0.95, 0.99, 1]).T)
    print("##################### Describe #####################")
    print(dataframe.describe().T)
    print("##################### Nunique #####################")
    print(dataframe[dataframe.columns].nunique())

In [35]:
check_df(df_edu1)

##################### Shape #####################
(57701, 29)
##################### Types #####################
Degree_School
Diğer_DOKTORA                         float64
Diğer_Diğer                           float64
Diğer_LİSANS                          float64
Diğer_YÜKSEK_LİSANS                   float64
Diğer_ÖNLİSANS                        float64
Diğer_Öğrenci                         float64
TheFirstLevel(TR)_DOKTORA             float64
TheFirstLevel(TR)_Diğer               float64
TheFirstLevel(TR)_LİSANS              float64
TheFirstLevel(TR)_YÜKSEK_LİSANS       float64
TheFirstLevel(TR)_ÖNLİSANS            float64
TheFirstLevel(TR)_Öğrenci             float64
TheFirstLevel(World)_DOKTORA          float64
TheFirstLevel(World)_Diğer            float64
TheFirstLevel(World)_LİSANS           float64
TheFirstLevel(World)_YÜKSEK_LİSANS    float64
TheFirstLevel(World)_Öğrenci          float64
TheLowLevel(TR)_DOKTORA               float64
TheLowLevel(TR)_Diğer                 float64


In [36]:
check_df(df_edu2)

##################### Shape #####################
(57701, 22)
##################### Types #####################
fields_of_study
Avukat                                 float64
Bilgisayar Mühendisliği                float64
Bilgisayar Öğretmenliği                float64
Diğer                                  float64
Elektrik ve Elektronik Mühendisliği    float64
Endüstri Mühendisliği                  float64
Fizik                                  float64
Gıda Mühendisliği                      float64
Kimya                                  float64
Makine Mühendisliği                    float64
Matematik Mühendisliği                 float64
Mekatronik Mühendisliği                float64
Metalurji ve Malzeme Mühendisliği      float64
Mühendislik Yönetimi                   float64
Uluslararası İlişkiler                 float64
Yapay Zeka                             float64
Yazılım Mühendisliği                   float64
Yönetim Bilişim Sistemleri             float64
İktisat                   

In [37]:
df_education = df_edu1.merge(df_edu2, on="user_id", how="left")

In [38]:
# df_education = pd.concat([df_edu1, df_edu2])

In [39]:
df_education.head()

Unnamed: 0_level_0,Diğer_DOKTORA,Diğer_Diğer,Diğer_LİSANS,Diğer_YÜKSEK_LİSANS,Diğer_ÖNLİSANS,Diğer_Öğrenci,TheFirstLevel(TR)_DOKTORA,TheFirstLevel(TR)_Diğer,TheFirstLevel(TR)_LİSANS,TheFirstLevel(TR)_YÜKSEK_LİSANS,TheFirstLevel(TR)_ÖNLİSANS,TheFirstLevel(TR)_Öğrenci,TheFirstLevel(World)_DOKTORA,TheFirstLevel(World)_Diğer,TheFirstLevel(World)_LİSANS,TheFirstLevel(World)_YÜKSEK_LİSANS,TheFirstLevel(World)_Öğrenci,TheLowLevel(TR)_DOKTORA,TheLowLevel(TR)_Diğer,TheLowLevel(TR)_LİSANS,TheLowLevel(TR)_YÜKSEK_LİSANS,TheLowLevel(TR)_ÖNLİSANS,TheLowLevel(TR)_Öğrenci,TheSecondLevel(TR)_DOKTORA,TheSecondLevel(TR)_Diğer,TheSecondLevel(TR)_LİSANS,TheSecondLevel(TR)_YÜKSEK_LİSANS,TheSecondLevel(TR)_ÖNLİSANS,TheSecondLevel(TR)_Öğrenci,Avukat,Bilgisayar Mühendisliği,Bilgisayar Öğretmenliği,Diğer,Elektrik ve Elektronik Mühendisliği,Endüstri Mühendisliği,Fizik,Gıda Mühendisliği,Kimya,Makine Mühendisliği,Matematik Mühendisliği,Mekatronik Mühendisliği,Metalurji ve Malzeme Mühendisliği,Mühendislik Yönetimi,Uluslararası İlişkiler,Yapay Zeka,Yazılım Mühendisliği,Yönetim Bilişim Sistemleri,İktisat,İnşaat Mühendisliği,İstatistik,İşletme
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1
1,,,,,,,,,1.0,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,,,,,
2,,,,,,,,1.0,,,,,,,,,,,,,,,,,1.0,,,,,,1.0,,1.0,,,,,,,,,,,,,,,,,,
3,,,,,,,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,,,,,
4,,,,,,,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,,,,,
5,,,1.0,,,,,,,1.0,,,,,,,,,,,,,,,1.0,,,,,,,,1.0,,,,,,,,,,,,,1.0,,,,,


In [40]:
import re

df_education.columns = [x.replace(" ", "_") for x in df_education.columns]

In [41]:
df_education.head()

Unnamed: 0_level_0,Diğer_DOKTORA,Diğer_Diğer,Diğer_LİSANS,Diğer_YÜKSEK_LİSANS,Diğer_ÖNLİSANS,Diğer_Öğrenci,TheFirstLevel(TR)_DOKTORA,TheFirstLevel(TR)_Diğer,TheFirstLevel(TR)_LİSANS,TheFirstLevel(TR)_YÜKSEK_LİSANS,TheFirstLevel(TR)_ÖNLİSANS,TheFirstLevel(TR)_Öğrenci,TheFirstLevel(World)_DOKTORA,TheFirstLevel(World)_Diğer,TheFirstLevel(World)_LİSANS,TheFirstLevel(World)_YÜKSEK_LİSANS,TheFirstLevel(World)_Öğrenci,TheLowLevel(TR)_DOKTORA,TheLowLevel(TR)_Diğer,TheLowLevel(TR)_LİSANS,TheLowLevel(TR)_YÜKSEK_LİSANS,TheLowLevel(TR)_ÖNLİSANS,TheLowLevel(TR)_Öğrenci,TheSecondLevel(TR)_DOKTORA,TheSecondLevel(TR)_Diğer,TheSecondLevel(TR)_LİSANS,TheSecondLevel(TR)_YÜKSEK_LİSANS,TheSecondLevel(TR)_ÖNLİSANS,TheSecondLevel(TR)_Öğrenci,Avukat,Bilgisayar_Mühendisliği,Bilgisayar_Öğretmenliği,Diğer,Elektrik_ve_Elektronik_Mühendisliği,Endüstri_Mühendisliği,Fizik,Gıda_Mühendisliği,Kimya,Makine_Mühendisliği,Matematik_Mühendisliği,Mekatronik_Mühendisliği,Metalurji_ve_Malzeme_Mühendisliği,Mühendislik_Yönetimi,Uluslararası_İlişkiler,Yapay_Zeka,Yazılım_Mühendisliği,Yönetim_Bilişim_Sistemleri,İktisat,İnşaat_Mühendisliği,İstatistik,İşletme
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1
1,,,,,,,,,1.0,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,,,,,
2,,,,,,,,1.0,,,,,,,,,,,,,,,,,1.0,,,,,,1.0,,1.0,,,,,,,,,,,,,,,,,,
3,,,,,,,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,,,,,
4,,,,,,,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,,,,,
5,,,1.0,,,,,,,1.0,,,,,,,,,,,,,,,1.0,,,,,,,,1.0,,,,,,,,,,,,,1.0,,,,,


In [42]:
df_education = df_education.fillna(0)

In [43]:
df_education.head()

Unnamed: 0_level_0,Diğer_DOKTORA,Diğer_Diğer,Diğer_LİSANS,Diğer_YÜKSEK_LİSANS,Diğer_ÖNLİSANS,Diğer_Öğrenci,TheFirstLevel(TR)_DOKTORA,TheFirstLevel(TR)_Diğer,TheFirstLevel(TR)_LİSANS,TheFirstLevel(TR)_YÜKSEK_LİSANS,TheFirstLevel(TR)_ÖNLİSANS,TheFirstLevel(TR)_Öğrenci,TheFirstLevel(World)_DOKTORA,TheFirstLevel(World)_Diğer,TheFirstLevel(World)_LİSANS,TheFirstLevel(World)_YÜKSEK_LİSANS,TheFirstLevel(World)_Öğrenci,TheLowLevel(TR)_DOKTORA,TheLowLevel(TR)_Diğer,TheLowLevel(TR)_LİSANS,TheLowLevel(TR)_YÜKSEK_LİSANS,TheLowLevel(TR)_ÖNLİSANS,TheLowLevel(TR)_Öğrenci,TheSecondLevel(TR)_DOKTORA,TheSecondLevel(TR)_Diğer,TheSecondLevel(TR)_LİSANS,TheSecondLevel(TR)_YÜKSEK_LİSANS,TheSecondLevel(TR)_ÖNLİSANS,TheSecondLevel(TR)_Öğrenci,Avukat,Bilgisayar_Mühendisliği,Bilgisayar_Öğretmenliği,Diğer,Elektrik_ve_Elektronik_Mühendisliği,Endüstri_Mühendisliği,Fizik,Gıda_Mühendisliği,Kimya,Makine_Mühendisliği,Matematik_Mühendisliği,Mekatronik_Mühendisliği,Metalurji_ve_Malzeme_Mühendisliği,Mühendislik_Yönetimi,Uluslararası_İlişkiler,Yapay_Zeka,Yazılım_Mühendisliği,Yönetim_Bilişim_Sistemleri,İktisat,İnşaat_Mühendisliği,İstatistik,İşletme
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [44]:
# cat_cols = ['fields_of_study', 'Degree_School']

In [45]:
"""def one_hot_encoder(dataframe, categorical_cols, drop_first=False):
    dataframe = pd.get_dummies(dataframe, columns=categorical_cols, drop_first=drop_first)
    return dataframe

df_edu = one_hot_encoder(df_edu, cat_cols, drop_first=True)"""

'def one_hot_encoder(dataframe, categorical_cols, drop_first=False):\n    dataframe = pd.get_dummies(dataframe, columns=categorical_cols, drop_first=drop_first)\n    return dataframe\n\ndf_edu = one_hot_encoder(df_edu, cat_cols, drop_first=True)'

### Skills Preprocessing

In [46]:
# df_skills = pd.read_csv(os.path.join(root, 'skills.csv'))

In [47]:
df_skills.head()

Unnamed: 0,user_id,skill
0,1,Mühendislik
1,1,Eğitim
2,2,Android
3,2,Java
4,2,3D Studio Max


In [48]:
df_skills['skill'].value_counts().head(20)

Java                    24360
JavaScript              24072
SQL                     23590
C#                      23422
HTML                    16872
CSS                     15109
C++                     14762
Microsoft SQL Server    14593
C                       14121
Microsoft Office        13854
Python                  13353
MySQL                   13320
Git                     12651
Linux                   11552
Software Development    10978
jQuery                  10497
ASP.NET                 10178
HTML5                    9380
.NET                     8963
ASP.NET MVC              8806
Name: skill, dtype: int64

In [49]:
df_skills.loc[df_skills['skill'].str.contains("HTML|CSS|JavaScript|Bootstrap|jQuery|AngularJS|React.js|Angular|JSP|AJAX|Front-end|frontend"), 'skill'] = "FRONTEND"
df_skills.loc[df_skills['skill'].str.contains("ASP|.NET|PHP|php|Java|java|Node.js|\#|Go|Spring Boot|Eclipse|backend"), 'skill'] = "BACKEND"
df_skills.loc[df_skills['skill'].str.contains("SQL|Database|MongoDB|Postgre|PL/|MySQL|Oracle|Veritabanı|Hibernate|Veritabanları|Big Data|veritabanı|Mongo"), 'skill'] = "DATABASE"
df_skills.loc[df_skills['skill'].str.contains("OOP|Object|Nesne"), 'skill'] = "OOP"
df_skills.loc[df_skills['skill'].str.contains("Teamwork|Ekip Çalışması|İletişim|Team Motivation"), 'skill'] = "TEAMWORKER"
df_skills.loc[df_skills['skill'].str.contains("Yazılım Geliştirme|Software Development|Jenkins|Software Design|Maven|UML|TFS|JIRA|DevOps"), 'skill'] = "SOFTWARE_DEVELOPMENT"
df_skills.loc[df_skills['skill'].str.contains("Agile|AGILE"), 'skill'] = "AGILE"
df_skills.loc[df_skills['skill'].str.contains("Excel|Office|Word|PowerPoint"), 'skill'] = "MICROSOFT_OFFICE"
df_skills.loc[df_skills['skill'].str.contains("WEB|Web"), 'skill'] = "WEB"
df_skills.loc[df_skills['skill'].str.contains("Management|Yönetim|Lider|Leadership"), 'skill'] = "MANAGEMENT"
df_skills.loc[df_skills['skill'].str.contains("Machine Learning|Veri Bilimi|Veri Analizi|Algorithms|Analysis|Makine Öğrenimi|Algoritmalar|Yapay Zeka|Artificial|Doğal Dil İşleme|İstatistik|Neural Networks"), 'skill'] = "ML"
df_skills.loc[df_skills['skill'].str.contains("Programlama|Programming"), 'skill'] = "PROGRAMMING"
df_skills.loc[df_skills['skill'].str.contains("Android|mobil|MOBILE|Mobile|Mobil Uygulamalar|Flutter|React Native"), 'skill'] = "ANDROID"
df_skills.loc[df_skills['skill'].str.contains("Framework|WCF|Django"), 'skill'] = "FRAMEWORK"
df_skills.loc[df_skills['skill'].str.contains("Unity|GAME|game"), 'skill'] = "GAME_DEV"
df_skills.loc[df_skills['skill'].str.contains("Araştırma|Research"), 'skill'] = "RESEARCH"
df_skills.loc[df_skills['skill'].str.contains("Mühendislik|Engineering"), 'skill'] = "ENGINEERING"
df_skills.loc[df_skills['skill'].str.contains("Embedded Systems|Microservices|AutoCAD|Arduino|SolidWorks|hardware|Donanım"), 'skill'] = "HARDWARE_SKILLS"
df_skills.loc[df_skills['skill'].str.contains("Problem Solving|Sorun Çözme"), 'skill'] = "PROBLEM_SOLVING"
df_skills.loc[df_skills['skill'].str.contains("Server|Tomcat|Docker|Redis|sunucu"), 'skill'] = "SERVER"
df_skills.loc[df_skills['skill'].str.contains("CLOUD|Cloud Computing|bulut|Kubernetes"), 'skill'] = "CLOUD"
df_skills.loc[df_skills['skill'].str.contains("PYTHON|Python|py|Pandas|Numpy"), 'skill'] = "PYTHON"
df_skills.loc[df_skills['skill'].str.contains("C+|cplusplus|c+"), 'skill'] = "C++"
df_skills.loc[df_skills['skill'].str.contains("Git|github|GITHUB"), 'skill'] = "GITHUB"
df_skills.loc[df_skills['skill'].str.contains("Linux|LINUX"), 'skill'] = "LINUX"
df_skills.loc[df_skills['skill'].str.contains("Photoshop|Photo|Adobe|DESIGN|design|illustrator|3D|Grafik|Tasarım"), 'skill'] = "DESIGNER"
df_skills.loc[df_skills['skill'].str.contains("Networking|network"), 'skill'] = "NETWORKING"
df_skills.loc[df_skills['skill'].str.contains("ECONOMY|PAYMENT|Payment|economy|ekonomi"), 'skill'] = "ECONOMY"
df_skills.loc[df_skills['skill'].str.contains("FINANCE|Finance|Finans|satış|finans|pazarlama|Sales"), 'skill'] = "FINANCE"
df_skills.loc[df_skills['skill'].str.contains("Testing|test|TEST|Manuel Test Etme"), 'skill'] = "TEST"
df_skills.loc[df_skills['skill'].str.contains("Siber Güvenlik|Siber|cybersecurity|security|5C|Güvenliği"), 'skill'] = "CYBERSECURITY"
df_skills.loc[df_skills['skill'].str.contains("iOS|ios|IOS"), 'skill'] = "IOS"
df_skills.loc[df_skills['skill'].str.contains("Marketing"), 'skill'] = "MARKETING"

In [50]:
df_skills.loc[~df_skills["skill"].isin(["FRONTEND","BACKEND","DATABASE","OOP","TEAMWORKER","SOFTWARE_DEVELOPMENT","AGILE", 
                                       "MICROSOFT_OFFICE","WEB","MANAGEMENT","ML","PROGRAMMING","ANDROID","FRAMEWORK","GAME_DEV", 
                                       "RESEARCH","ENGINEERING","HARDWARE_SKILLS","PROBLEM_SOLVING","SERVER","CLOUD","PYTHON","C++", 
                                       "GITHUB","LINUX","DESIGNER","NETWORKING","ECONOMY","FINANCE","TEST",
                                       "CYBERSECURITY","IOS","MARKETING"]), "skill"] = "OtherSkill"

In [51]:
df_skills['skill'].value_counts().head(20)

C++                     414653
OtherSkill              391764
FRONTEND                107985
DATABASE                100369
MANAGEMENT               54404
SOFTWARE_DEVELOPMENT     42031
WEB                      38253
ANDROID                  24423
ML                       24119
OOP                      23862
PROGRAMMING              17499
ENGINEERING              15465
GITHUB                   15370
FRAMEWORK                15353
PYTHON                   14642
LINUX                    14210
SERVER                   13152
DESIGNER                 12805
HARDWARE_SKILLS          11620
AGILE                     9605
Name: skill, dtype: int64

In [52]:
#df_skill.loc[df_skill['skill'].str.contains("Java|Ön|ön lisans|önlisans", na=False), 'skill'] = "Java"
# df_skill.loc[df_skill['skill'].str.contains("SQL|Doktor|Ph", na=False), 'skill'] = "SQL"
#df_skill.loc[df_skill['skill'].str.contains("C++|Master|MSc|MS|M.Sc.|MBA|Msc|M.Sc|M.S.|M.S", na=False), 'skill'] = "C#"
#df_skill.loc[df_skill['skill'].str.contains("Software", na=False), 'skill'] = "Yazılım"
#df_skill.loc[df_skill['skill'].str.contains("HTML5|Student", na=False), 'skill'] = "HTML"



# df_edu.loc[~df_edu["degree"].isin(["ÖNLİSANS","DOKTORA","YÜKSEK_LİSANS","LİSANS","Öğrenci"]), "degree"] = "Diğer"

In [53]:
df_skills['have'] = True
df_skills = df_skills.drop_duplicates(['user_id', 'skill'])
df_skills = pd.pivot(df_skills, index='user_id', columns='skill', values='have')
df_skills = df_skills.fillna(0).astype(int)
df_skills.head()

skill,AGILE,ANDROID,C++,CYBERSECURITY,DATABASE,DESIGNER,ECONOMY,ENGINEERING,FINANCE,FRAMEWORK,FRONTEND,GAME_DEV,GITHUB,HARDWARE_SKILLS,IOS,LINUX,MANAGEMENT,MARKETING,ML,NETWORKING,OOP,OtherSkill,PROBLEM_SOLVING,PROGRAMMING,PYTHON,SERVER,SOFTWARE_DEVELOPMENT,TEAMWORKER,TEST,WEB
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1
1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
2,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
3,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
5,0,0,1,0,1,1,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,1
6,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,1,0,0


In [54]:
# df_skill['skill'].value_counts().iloc[:30].index # we only took most common 20 skills, you can increase it

# df_skill = df_skill[df_skill['skill'].isin(used_skills)]
# df_skill['experience'] = True

## Language Preprocessing

In [55]:
df_lang = df_lang[df_lang['language'].notnull() & df_lang['proficiency'].notnull()]

In [56]:
lang_rank = {
    'elementary': 1,
    'limited_working': 2,
    'professional_working': 3,
    'full_professional': 4,
    'native_or_bilingual': 5
}

df_lang.loc[:, 'proficiency'] = df_lang.loc[:, 'proficiency'].map(lang_rank)

In [57]:
df_lang.head(10)

Unnamed: 0,user_id,language,proficiency
0,8,İngilizce,4
1,8,Türkçe,5
2,8,Fransızca,1
4,11,Turkish,5
5,11,English,3
6,11,Kurdish,5
7,12,Türkçe,5
8,12,English,3
9,13,İngilizce,4
10,13,Almanca,2


In [58]:
df_lang = df_lang[df_lang['proficiency'] > 2]

In [59]:
# Büyük harf dönüşümü.
df_lang['language'] = df_lang.language.apply(lambda x: " ".join(x.upper() for x in x.split()))

In [60]:
# noktalama işaretlerinin ortadan kaldırılması;
df_lang['language'] = df_lang.language.str.replace('[^\w\s]', '',regex = True)

In [61]:
# sayıların ortadan kaldırılması;
df_lang['language'] = df_lang.language.str.replace('\d', '',regex = True)

In [62]:
# ifade = "Bu ifade İçerisinde bağzı TÜrkçe karakterler vardır"
duzeltilecek_harfler = "çÇğĞıİöÖşŞüÜ"
duzeltilmis_harfler = "cCgGiIoOsSuU"
alfabe_duzeltme = str.maketrans(duzeltilecek_harfler, duzeltilmis_harfler)
# ifade.translate(alfabe_duzeltme)

In [63]:
# harf düzeltme;
df_lang['language'] = df_lang.language.str.translate(alfabe_duzeltme)

In [64]:
# boşluk silme;
df_lang['language'] = df_lang.language.str.strip()

In [65]:
df_lang.language.value_counts().head(30)

INGILIZCE                  14587
TURKCE                     11923
ENGLISH                    11639
TURKISH                     7496
GERMAN                       341
ARABIC                       299
ALMANCA                      269
FRENCH                       199
INGILIZCE ORTA               120
RUSSIAN                      119
AZERBAIJANI                  115
FRANSIZCA                    108
PERSIAN                       79
ARAPCA                        71
RUSCA                         69
KURDISH                       60
AZERICE                       57
URDU                          56
INGILIZCE ESKI YAKLASIK       52
KURTCE                        47
SPANISH                       47
TURKCE OSMANLICA              45
ISPANYOLCA                    40
TURKMEN                       34
DEUTSCH                       33
ARMENIAN                      27
ALBANIAN                      23
ENGLISCH                      22
TURKISCH                      22
ITALIAN                       21
Name: lang

In [66]:
df_lang.loc[df_lang['language'].str.contains("ENG|ING"), 'language'] = "ENGLISH"
df_lang.loc[df_lang['language'].str.contains("AZƏ|AZE"), 'language'] = "AZERBAIJANI" 
df_lang.loc[df_lang['language'].str.contains("TUR|TUKCE"), 'language'] = "TURKISH"
df_lang.loc[df_lang['language'].str.contains("ALM|GER|DEU"), 'language'] = "GERMAN"
df_lang.loc[df_lang['language'].str.contains("FR"), 'language'] = "FRENCH"
df_lang.loc[df_lang['language'].str.contains("ESPA|ISP|SPA"), 'language'] = "SPANISH"
df_lang.loc[df_lang['language'].str.contains("RUS"), 'language'] = "RUSSIAN"
df_lang.loc[df_lang['language'].str.contains("ARA"), 'language'] = "ARABIC" 
df_lang.loc[df_lang['language'].str.contains("CHI|CIN"), 'language'] = "CHINESE"
df_lang.loc[df_lang['language'].str.contains("FAR|PERS"), 'language'] = "PERSIAN"
df_lang.loc[df_lang['language'].str.contains("JAP"), 'language'] = "JAPANESE"
df_lang.loc[df_lang['language'].str.contains("ITAL"), 'language'] = "ITALIAN"
df_lang.loc[df_lang['language'].str.contains("KUR"), 'language'] = "KURDISH"

df_lang.loc[~df_lang["language"].isin(["ENGLISH","AZERBAIJANI","TURKISH","GERMAN","FRENCH","SPANISH","RUSSIAN","ARABIC","CHINESE","PERSIAN","JAPANESE","ITALIAN","KURDISH"]), "language"] = "OtherLanguage"

In [67]:
df_lang.language.value_counts().head(30)

ENGLISH          26525
TURKISH          19558
GERMAN             654
OtherLanguage      551
ARABIC             374
FRENCH             321
RUSSIAN            189
AZERBAIJANI        179
KURDISH            112
PERSIAN             93
SPANISH             91
ITALIAN             41
JAPANESE            18
CHINESE             15
Name: language, dtype: int64

In [68]:
df_lang = df_lang.drop_duplicates(['user_id', 'language'])
df_lang = pd.pivot(df_lang, index='user_id', columns='language', values='proficiency')
df_lang = df_lang.fillna(0).astype(int)
df_lang.head()

language,ARABIC,AZERBAIJANI,CHINESE,ENGLISH,FRENCH,GERMAN,ITALIAN,JAPANESE,KURDISH,OtherLanguage,PERSIAN,RUSSIAN,SPANISH,TURKISH
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
8,0,0,0,4,0,0,0,0,0,0,0,0,0,5
11,0,0,0,3,0,0,0,0,5,0,0,0,0,5
12,0,0,0,3,0,0,0,0,0,0,0,0,0,5
13,0,0,0,4,0,0,0,0,0,0,0,0,0,0
14,0,0,0,3,0,0,0,0,0,0,0,0,0,5


### Experiences

In [69]:
df_exp = df_exp.sort_values(by=['user_id', 'start_year_month']) # we want nth function to return the last companies by date
df_new = pd.DataFrame()

In [70]:
df_new['company(1th)'] = df_exp.groupby(idx)['company_id'].nth(-1).astype(str)
df_new['company(2th)'] = df_exp.groupby(idx)['company_id'].nth(-2).astype(str)
df_new['company(3th)'] = df_exp.groupby(idx)['company_id'].nth(-3).astype(str)

df_new['company_location(1th)'] = df_exp.groupby(idx)['location'].nth(-1).astype(str)
df_new['company_location(2th)'] = df_exp.groupby(idx)['location'].nth(-2).astype(str)
df_new['company_location(3th)'] = df_exp.groupby(idx)['location'].nth(-3).astype(str)

df_new['min_exp_time'] = df_exp.groupby(idx)['start_year_month'].min()
df_new['max_exp_time'] = df_exp.groupby(idx)['start_year_month'].max()

df_new['company_count_2018'] = df_exp[df_exp['start_year_month'].gt(201712)].groupby(idx).size()
df_new['company_count_2017'] = df_exp[df_exp['start_year_month'].gt(201612)].groupby(idx).size()
df_new['company_count_2016'] = df_exp[df_exp['start_year_month'].gt(201512)].groupby(idx).size()

In [71]:
df_exp = df_new
df_exp.head()

Unnamed: 0_level_0,company(1th),company(2th),company(3th),company_location(1th),company_location(2th),company_location(3th),min_exp_time,max_exp_time,company_count_2018,company_count_2017,company_count_2016
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,0,0.0,,Visual Studio Asp.Net Developer,Serbest Çalışmalar,,200509,200509,,,
2,10,7.0,9.0,"Mersin, Turkey","Elazig, Turkey","Elazig, Turkey",201612,201806,1.0,2.0,3.0
5,15,,,İstanbul,,,201706,201706,,1.0,1.0
7,20,21.0,21.0,Elazığ,Macedonia,"Istanbul, Turkey",201607,201812,1.0,3.0,4.0
10,26,26.0,27.0,istanbul,"Istanbul, Turkey","Elazig, Turkey",201308,201805,2.0,2.0,2.0


### Tabloların kontrolü

In [72]:
df_exp.head()

Unnamed: 0_level_0,company(1th),company(2th),company(3th),company_location(1th),company_location(2th),company_location(3th),min_exp_time,max_exp_time,company_count_2018,company_count_2017,company_count_2016
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,0,0.0,,Visual Studio Asp.Net Developer,Serbest Çalışmalar,,200509,200509,,,
2,10,7.0,9.0,"Mersin, Turkey","Elazig, Turkey","Elazig, Turkey",201612,201806,1.0,2.0,3.0
5,15,,,İstanbul,,,201706,201706,,1.0,1.0
7,20,21.0,21.0,Elazığ,Macedonia,"Istanbul, Turkey",201607,201812,1.0,3.0,4.0
10,26,26.0,27.0,istanbul,"Istanbul, Turkey","Elazig, Turkey",201308,201805,2.0,2.0,2.0


In [73]:
df_skills.head()

skill,AGILE,ANDROID,C++,CYBERSECURITY,DATABASE,DESIGNER,ECONOMY,ENGINEERING,FINANCE,FRAMEWORK,FRONTEND,GAME_DEV,GITHUB,HARDWARE_SKILLS,IOS,LINUX,MANAGEMENT,MARKETING,ML,NETWORKING,OOP,OtherSkill,PROBLEM_SOLVING,PROGRAMMING,PYTHON,SERVER,SOFTWARE_DEVELOPMENT,TEAMWORKER,TEST,WEB
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1
1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
2,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
3,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
5,0,0,1,0,1,1,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,1
6,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,1,0,0


In [74]:
df_lang.head()

language,ARABIC,AZERBAIJANI,CHINESE,ENGLISH,FRENCH,GERMAN,ITALIAN,JAPANESE,KURDISH,OtherLanguage,PERSIAN,RUSSIAN,SPANISH,TURKISH
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
8,0,0,0,4,0,0,0,0,0,0,0,0,0,5
11,0,0,0,3,0,0,0,0,5,0,0,0,0,5
12,0,0,0,3,0,0,0,0,0,0,0,0,0,5
13,0,0,0,4,0,0,0,0,0,0,0,0,0,0
14,0,0,0,3,0,0,0,0,0,0,0,0,0,5


In [75]:
df_education.head()

Unnamed: 0_level_0,Diğer_DOKTORA,Diğer_Diğer,Diğer_LİSANS,Diğer_YÜKSEK_LİSANS,Diğer_ÖNLİSANS,Diğer_Öğrenci,TheFirstLevel(TR)_DOKTORA,TheFirstLevel(TR)_Diğer,TheFirstLevel(TR)_LİSANS,TheFirstLevel(TR)_YÜKSEK_LİSANS,TheFirstLevel(TR)_ÖNLİSANS,TheFirstLevel(TR)_Öğrenci,TheFirstLevel(World)_DOKTORA,TheFirstLevel(World)_Diğer,TheFirstLevel(World)_LİSANS,TheFirstLevel(World)_YÜKSEK_LİSANS,TheFirstLevel(World)_Öğrenci,TheLowLevel(TR)_DOKTORA,TheLowLevel(TR)_Diğer,TheLowLevel(TR)_LİSANS,TheLowLevel(TR)_YÜKSEK_LİSANS,TheLowLevel(TR)_ÖNLİSANS,TheLowLevel(TR)_Öğrenci,TheSecondLevel(TR)_DOKTORA,TheSecondLevel(TR)_Diğer,TheSecondLevel(TR)_LİSANS,TheSecondLevel(TR)_YÜKSEK_LİSANS,TheSecondLevel(TR)_ÖNLİSANS,TheSecondLevel(TR)_Öğrenci,Avukat,Bilgisayar_Mühendisliği,Bilgisayar_Öğretmenliği,Diğer,Elektrik_ve_Elektronik_Mühendisliği,Endüstri_Mühendisliği,Fizik,Gıda_Mühendisliği,Kimya,Makine_Mühendisliği,Matematik_Mühendisliği,Mekatronik_Mühendisliği,Metalurji_ve_Malzeme_Mühendisliği,Mühendislik_Yönetimi,Uluslararası_İlişkiler,Yapay_Zeka,Yazılım_Mühendisliği,Yönetim_Bilişim_Sistemleri,İktisat,İnşaat_Mühendisliği,İstatistik,İşletme
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [76]:
#df_education = df_education.loc[:,~df_education.columns.duplicated()]

In [77]:
#df_education.reset_index(inplace = True)

In [78]:
#df_education = df_education[[c for c in df_education.columns if not c.endswith('_delete_suffix')]]

In [79]:
# df_education.set_index(idx)

In [80]:
df_train[df_education.columns]   = df_education[df_education.columns]
df_train[df_lang.columns]  = df_lang[df_lang.columns]
df_train[df_skills.columns] = df_skills[df_skills.columns]
df_train[df_exp.columns]   = df_exp[df_exp.columns]

In [81]:
df_test[df_education.columns]   = df_education[df_education.columns]
df_test[df_lang.columns]  = df_lang[df_lang.columns]
df_test[df_skills.columns] = df_skills[df_skills.columns]
df_test[df_exp.columns]   = df_exp[df_exp.columns]

In [82]:
df_train.head()

Unnamed: 0_level_0,industry,location,moved_after_2019,Diğer_DOKTORA,Diğer_Diğer,Diğer_LİSANS,Diğer_YÜKSEK_LİSANS,Diğer_ÖNLİSANS,Diğer_Öğrenci,TheFirstLevel(TR)_DOKTORA,TheFirstLevel(TR)_Diğer,TheFirstLevel(TR)_LİSANS,TheFirstLevel(TR)_YÜKSEK_LİSANS,TheFirstLevel(TR)_ÖNLİSANS,TheFirstLevel(TR)_Öğrenci,TheFirstLevel(World)_DOKTORA,TheFirstLevel(World)_Diğer,TheFirstLevel(World)_LİSANS,TheFirstLevel(World)_YÜKSEK_LİSANS,TheFirstLevel(World)_Öğrenci,TheLowLevel(TR)_DOKTORA,TheLowLevel(TR)_Diğer,TheLowLevel(TR)_LİSANS,TheLowLevel(TR)_YÜKSEK_LİSANS,TheLowLevel(TR)_ÖNLİSANS,TheLowLevel(TR)_Öğrenci,TheSecondLevel(TR)_DOKTORA,TheSecondLevel(TR)_Diğer,TheSecondLevel(TR)_LİSANS,TheSecondLevel(TR)_YÜKSEK_LİSANS,TheSecondLevel(TR)_ÖNLİSANS,TheSecondLevel(TR)_Öğrenci,Avukat,Bilgisayar_Mühendisliği,Bilgisayar_Öğretmenliği,Diğer,Elektrik_ve_Elektronik_Mühendisliği,Endüstri_Mühendisliği,Fizik,Gıda_Mühendisliği,Kimya,Makine_Mühendisliği,Matematik_Mühendisliği,Mekatronik_Mühendisliği,Metalurji_ve_Malzeme_Mühendisliği,Mühendislik_Yönetimi,Uluslararası_İlişkiler,Yapay_Zeka,Yazılım_Mühendisliği,Yönetim_Bilişim_Sistemleri,İktisat,İnşaat_Mühendisliği,İstatistik,İşletme,ARABIC,AZERBAIJANI,CHINESE,ENGLISH,FRENCH,GERMAN,ITALIAN,JAPANESE,KURDISH,OtherLanguage,PERSIAN,RUSSIAN,SPANISH,TURKISH,AGILE,ANDROID,C++,CYBERSECURITY,DATABASE,DESIGNER,ECONOMY,ENGINEERING,FINANCE,FRAMEWORK,FRONTEND,GAME_DEV,GITHUB,HARDWARE_SKILLS,IOS,LINUX,MANAGEMENT,MARKETING,ML,NETWORKING,OOP,OtherSkill,PROBLEM_SOLVING,PROGRAMMING,PYTHON,SERVER,SOFTWARE_DEVELOPMENT,TEAMWORKER,TEST,WEB,company(1th),company(2th),company(3th),company_location(1th),company_location(2th),company_location(3th),min_exp_time,max_exp_time,company_count_2018,company_count_2017,company_count_2016
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1
1301,Information Technology and Services,"Istanbul, Istanbul, Turkey",1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,26,1875,1876,,"Izmir, Turkey","Izmir, Turkey",201505.0,201803.0,1.0,1.0,1.0
6950,Internet,"Istanbul, Istanbul, Turkey",0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1337,1337,1337,"Istanbul, Turkey",,"Istanbul, Turkey",201304.0,201702.0,,1.0,3.0
4880,Online Media,Turkey,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4366,5291,5292,"Istanbul, Turkey",,,201401.0,201705.0,,1.0,2.0
26046,Telecommunications,"Istanbul, Istanbul, Turkey",0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,26,26,26,"Istanbul, Turkey",,,200909.0,201410.0,,,
11005,Banking,"Istanbul, Turkey",0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1562,7191,151,"İstanbul, Türkiye",,,200909.0,201704.0,,1.0,1.0


## Modelleme

In [83]:
df_all = pd.concat([df_train, df_test], axis=0)

In [84]:
cat_cols = [col for col in df_test.columns if df_all[col].dtype == 'object']
num_cols = [col for col in df_test.columns if df_all[col].dtype != 'object']

In [85]:
for col in cat_cols:
    df_all[col] = df_all[col].factorize()[0]

In [86]:
df_all[cat_cols] = df_all[cat_cols].astype('category')
df_all[num_cols] = df_all[num_cols].fillna(0)

In [87]:
df_train = df_all.loc[df_train.index, df_train.columns]
df_test = df_all.loc[df_test.index, df_test.columns]

In [88]:
df_train.head()

Unnamed: 0_level_0,industry,location,moved_after_2019,Diğer_DOKTORA,Diğer_Diğer,Diğer_LİSANS,Diğer_YÜKSEK_LİSANS,Diğer_ÖNLİSANS,Diğer_Öğrenci,TheFirstLevel(TR)_DOKTORA,TheFirstLevel(TR)_Diğer,TheFirstLevel(TR)_LİSANS,TheFirstLevel(TR)_YÜKSEK_LİSANS,TheFirstLevel(TR)_ÖNLİSANS,TheFirstLevel(TR)_Öğrenci,TheFirstLevel(World)_DOKTORA,TheFirstLevel(World)_Diğer,TheFirstLevel(World)_LİSANS,TheFirstLevel(World)_YÜKSEK_LİSANS,TheFirstLevel(World)_Öğrenci,TheLowLevel(TR)_DOKTORA,TheLowLevel(TR)_Diğer,TheLowLevel(TR)_LİSANS,TheLowLevel(TR)_YÜKSEK_LİSANS,TheLowLevel(TR)_ÖNLİSANS,TheLowLevel(TR)_Öğrenci,TheSecondLevel(TR)_DOKTORA,TheSecondLevel(TR)_Diğer,TheSecondLevel(TR)_LİSANS,TheSecondLevel(TR)_YÜKSEK_LİSANS,TheSecondLevel(TR)_ÖNLİSANS,TheSecondLevel(TR)_Öğrenci,Avukat,Bilgisayar_Mühendisliği,Bilgisayar_Öğretmenliği,Diğer,Elektrik_ve_Elektronik_Mühendisliği,Endüstri_Mühendisliği,Fizik,Gıda_Mühendisliği,Kimya,Makine_Mühendisliği,Matematik_Mühendisliği,Mekatronik_Mühendisliği,Metalurji_ve_Malzeme_Mühendisliği,Mühendislik_Yönetimi,Uluslararası_İlişkiler,Yapay_Zeka,Yazılım_Mühendisliği,Yönetim_Bilişim_Sistemleri,İktisat,İnşaat_Mühendisliği,İstatistik,İşletme,ARABIC,AZERBAIJANI,CHINESE,ENGLISH,FRENCH,GERMAN,ITALIAN,JAPANESE,KURDISH,OtherLanguage,PERSIAN,RUSSIAN,SPANISH,TURKISH,AGILE,ANDROID,C++,CYBERSECURITY,DATABASE,DESIGNER,ECONOMY,ENGINEERING,FINANCE,FRAMEWORK,FRONTEND,GAME_DEV,GITHUB,HARDWARE_SKILLS,IOS,LINUX,MANAGEMENT,MARKETING,ML,NETWORKING,OOP,OtherSkill,PROBLEM_SOLVING,PROGRAMMING,PYTHON,SERVER,SOFTWARE_DEVELOPMENT,TEAMWORKER,TEST,WEB,company(1th),company(2th),company(3th),company_location(1th),company_location(2th),company_location(3th),min_exp_time,max_exp_time,company_count_2018,company_count_2017,company_count_2016
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1
1301,0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0,0,0,0,0,0,201505.0,201803.0,1.0,1.0,1.0
6950,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1,1,1,1,1,1,201304.0,201702.0,0.0,1.0,3.0
4880,2,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2,2,2,1,1,2,201401.0,201705.0,0.0,1.0,2.0
26046,3,0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0,3,3,1,1,2,200909.0,201410.0,0.0,0.0,0.0
11005,4,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,3,4,4,2,1,2,200909.0,201704.0,0.0,1.0,1.0


In [89]:
df_test.head()

Unnamed: 0_level_0,industry,location,Diğer_DOKTORA,Diğer_Diğer,Diğer_LİSANS,Diğer_YÜKSEK_LİSANS,Diğer_ÖNLİSANS,Diğer_Öğrenci,TheFirstLevel(TR)_DOKTORA,TheFirstLevel(TR)_Diğer,TheFirstLevel(TR)_LİSANS,TheFirstLevel(TR)_YÜKSEK_LİSANS,TheFirstLevel(TR)_ÖNLİSANS,TheFirstLevel(TR)_Öğrenci,TheFirstLevel(World)_DOKTORA,TheFirstLevel(World)_Diğer,TheFirstLevel(World)_LİSANS,TheFirstLevel(World)_YÜKSEK_LİSANS,TheFirstLevel(World)_Öğrenci,TheLowLevel(TR)_DOKTORA,TheLowLevel(TR)_Diğer,TheLowLevel(TR)_LİSANS,TheLowLevel(TR)_YÜKSEK_LİSANS,TheLowLevel(TR)_ÖNLİSANS,TheLowLevel(TR)_Öğrenci,TheSecondLevel(TR)_DOKTORA,TheSecondLevel(TR)_Diğer,TheSecondLevel(TR)_LİSANS,TheSecondLevel(TR)_YÜKSEK_LİSANS,TheSecondLevel(TR)_ÖNLİSANS,TheSecondLevel(TR)_Öğrenci,Avukat,Bilgisayar_Mühendisliği,Bilgisayar_Öğretmenliği,Diğer,Elektrik_ve_Elektronik_Mühendisliği,Endüstri_Mühendisliği,Fizik,Gıda_Mühendisliği,Kimya,Makine_Mühendisliği,Matematik_Mühendisliği,Mekatronik_Mühendisliği,Metalurji_ve_Malzeme_Mühendisliği,Mühendislik_Yönetimi,Uluslararası_İlişkiler,Yapay_Zeka,Yazılım_Mühendisliği,Yönetim_Bilişim_Sistemleri,İktisat,İnşaat_Mühendisliği,İstatistik,İşletme,ARABIC,AZERBAIJANI,CHINESE,ENGLISH,FRENCH,GERMAN,ITALIAN,JAPANESE,KURDISH,OtherLanguage,PERSIAN,RUSSIAN,SPANISH,TURKISH,AGILE,ANDROID,C++,CYBERSECURITY,DATABASE,DESIGNER,ECONOMY,ENGINEERING,FINANCE,FRAMEWORK,FRONTEND,GAME_DEV,GITHUB,HARDWARE_SKILLS,IOS,LINUX,MANAGEMENT,MARKETING,ML,NETWORKING,OOP,OtherSkill,PROBLEM_SOLVING,PROGRAMMING,PYTHON,SERVER,SOFTWARE_DEVELOPMENT,TEAMWORKER,TEST,WEB,company(1th),company(2th),company(3th),company_location(1th),company_location(2th),company_location(3th),min_exp_time,max_exp_time,company_count_2018,company_count_2017,company_count_2016
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1
17449,54,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,135,708,522,0,32,1,200101.0,201201.0,0.0,0.0,0.0
33967,5,0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,7614,-1,1,1,-1,201606.0,201806.0,1.0,1.0,2.0
2110,25,1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1,-1,-1,-1,-1,-1,0.0,0.0,0.0,0.0,0.0
55082,1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,4717,3657,-1,165,5,-1,201108.0,201702.0,0.0,1.0,1.0
37165,9,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9080,-1,-1,600,-1,-1,201606.0,201606.0,0.0,0.0,1.0


In [90]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier

In [91]:
# Base Models
models = [('LR', LogisticRegression()),
          ('CART', DecisionTreeClassifier()),
          ('RF', RandomForestClassifier()),
          ('Adaboost', AdaBoostClassifier()),
          ('XGBoost', XGBClassifier(use_label_encoder=False, eval_metric='logloss')),
          ("LightGBM", LGBMClassifier()),
          ("CatBoost", CatBoostClassifier(verbose=False))]

In [92]:
X, y = df_train.drop(columns=[target]), df_train[target]

In [93]:
# X = df_train.drop(columns=['user_id','moved_after_2019'])

In [94]:
# y

In [95]:
# X

In [96]:
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, cross_validate

In [97]:

for name, model in models:
    cv_results = cross_validate(model, X, y, cv=10, scoring=["accuracy", "f1", "roc_auc", "precision", "recall"])
    print(f"########## {name} ##########")
    print(f"Accuracy: {round(cv_results['test_accuracy'].mean(), 4)}")
    print(f"Auc: {round(cv_results['test_roc_auc'].mean(), 4)}")
    print(f"Recall: {round(cv_results['test_recall'].mean(), 4)}")
    print(f"Precision: {round(cv_results['test_precision'].mean(), 4)}")
    print(f"F1: {round(cv_results['test_f1'].mean(), 4)}")


########## LR ##########
Accuracy: 0.6131
Auc: 0.549
Recall: 0.0081
Precision: 0.494
F1: 0.0159
########## CART ##########
Accuracy: 0.7303
Auc: 0.7171
Recall: 0.658
Precision: 0.6498
F1: 0.6538
########## RF ##########
Accuracy: 0.7813
Auc: 0.8653
Recall: 0.5839
Precision: 0.7969
F1: 0.6739
########## Adaboost ##########
Accuracy: 0.6491
Auc: 0.6594
Recall: 0.3037
Precision: 0.5911
F1: 0.4011
########## XGBoost ##########
Accuracy: nan
Auc: nan
Recall: nan
Precision: nan
F1: nan
########## LightGBM ##########
Accuracy: 0.7068
Auc: 0.7553
Recall: 0.4818
Precision: 0.6684
F1: 0.5599
########## CatBoost ##########
Accuracy: nan
Auc: nan
Recall: nan
Precision: nan
F1: nan


In [98]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

In [99]:
X, y = df_train.drop(columns=[target]), df_train[target]
clf = RandomForestClassifier()
cv  = KFold()

scores = cross_val_score(clf, X, y, cv=cv, scoring='accuracy')

In [100]:
print(f'Scores mean: {np.mean(scores):.4f}')
print(f'Scores std: {np.std(scores):.4f}')

Scores mean: 0.7738
Scores std: 0.0032


In [101]:
# target = ['moved_after_2019','user_id']

In [102]:
# df_test.drop(columns=["user_id"], inplace = True)

In [103]:
#cols = ['user_id']
#set(df_test .columns).issuperset(cols)

In [104]:
#df_test.columns = df_test.columns.str.strip()

In [105]:
#df_test.columns = df_test.columns.to_series().replace({r'\s+': ' ', r'_+': '_', r'—': '-'}, regex=True)

In [106]:
#df_test.head()

In [107]:
#df_test.reset_index(inplace = True)

In [108]:
#df_test = df_test.set_index(idx)

In [109]:
#df_test

In [110]:
clf.fit(X, y)
df_subm.loc[df_test.index, target] = clf.predict(df_test)

df_subm[target] = df_subm[target].astype(int)
df_subm[target].value_counts()

0    9363
1    3892
Name: moved_after_2019, dtype: int64

In [111]:
df_subm.to_csv('submission2.csv')

In [112]:
# view the feature scores

feature_scores = pd.Series(clf.feature_importances_, index=X.columns).sort_values(ascending=False)

feature_scores.head(20)

min_exp_time                0.067
company(1th)                0.058
location                    0.057
industry                    0.057
max_exp_time                0.052
company(2th)                0.043
company_location(1th)       0.036
company(3th)                0.031
company_location(2th)       0.028
company_location(3th)       0.022
ENGLISH                     0.022
company_count_2016          0.016
Diğer                       0.015
TheSecondLevel(TR)_LİSANS   0.014
MANAGEMENT                  0.014
company_count_2017          0.014
SOFTWARE_DEVELOPMENT        0.013
WEB                         0.013
FRONTEND                    0.013
Bilgisayar_Mühendisliği     0.013
dtype: float64

In [113]:
# df_train[df_train['moved_after_2019'] > 0].value_counts()