In [16]:
#!pip install pyforest

# 1-Import Libraies
import numpy as np
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats
%matplotlib inline
import statsmodels.api as sm
import statsmodels.formula.api as smf
import missingno as msno 

from sklearn.compose import make_column_transformer

#Scaling
from sklearn.preprocessing import scale 
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures 
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import PowerTransformer 
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler


#Importing plotly and cufflinks in offline mode
import cufflinks as cf
import plotly.offline
cf.go_offline()
cf.set_config_file(offline=False, world_readable=True)

#Ignore Warnings
import warnings
warnings.filterwarnings("ignore")
warnings.warn("this will not show")

#Figure&Display options
plt.rcParams["figure.figsize"] = (10,6)
pd.set_option('max_colwidth',200)
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 200)
pd.set_option('display.float_format', lambda x: '%.3f' % x)

#!pip install termcolor
import colorama
from colorama import Fore, Style  # maakes strings colored
from termcolor import colored

import ipywidgets
from ipywidgets import interact

In [17]:
## Useful Functions

###############################################################################

def missing(df):
    missing_number = df.isnull().sum().sort_values(ascending=False)
    missing_percent = (df.isnull().sum()/df.isnull().count()).sort_values(ascending=False)
    missing_values = pd.concat([missing_number, missing_percent], axis=1, keys=['Missing_Number', 'Missing_Percent'])
    return missing_values

def missing_values(df):
    return missing(df)[missing(df)['Missing_Number']>0]

###############################################################################

def first_looking(df):
    print(colored("Shape:", attrs=['bold']), df.shape,'\n',
          f"There is ", df.shape[0], " observation and ", df.shape[1], " columns in the dataset.", '\n',
          colored('-'*79, 'red', attrs=['bold']),
          colored("\nInfo:\n", attrs=['bold']), sep='')
    print(df.info(), '\n', 
          colored('-'*79, 'red', attrs=['bold']), sep='')
    print(colored("Number of Uniques:\n", attrs=['bold']), df.nunique(),'\n',
          colored('-'*79, 'red', attrs=['bold']), sep='')
    print(colored("Missing Values:\n", attrs=['bold']), missing_values(df),'\n', 
          colored('-'*79, 'red', attrs=['bold']), sep='')
    print(colored("All Columns:", attrs=['bold']), list(df.columns),'\n', 
          colored('-'*79, 'red', attrs=['bold']), sep='')

    df.columns= df.columns.str.lower().str.replace('&', '_').str.replace(' ', '_')

    print(colored("Columns after rename:", attrs=['bold']), list(df.columns),'\n',
              colored('-'*79, 'red', attrs=['bold']), sep='')

def duplicate_values(df):
    duplicate_values = df.duplicated(subset=None, keep='first').sum()
    if duplicate_values > 0:
        df.drop_duplicates(keep='first', inplace=True)
        print(duplicate_values, colored("duplicates were dropped", attrs=['bold']),'\n',
              colored('-'*79, 'red', attrs=['bold']), sep='')
    else:
        print(colored("No duplicates", attrs=['bold']),'\n',
              colored('-'*79, 'red', attrs=['bold']), sep='')
        
def drop_columns(df, drop_columns):
    if drop_columns !=[]:
        df.drop(drop_columns, axis=1, inplace=True)
        print(drop_columns, 'were dropped')
    else:
        print(colored('We will now check the missing values and if necessary drop some columns!!!', attrs=['bold']),'\n',
              colored('-'*79, 'red', attrs=['bold']), sep='')
        
def drop_null(df, limit):
    print('Shape:', df.shape)
    for i in df.isnull().sum().index:
        if (df.isnull().sum()[i]/df.shape[0]*100)>limit:
            print(df.isnull().sum()[i], 'percent of', i ,'null and were dropped')
            df.drop(i, axis=1, inplace=True)
            print('new shape:', df.shape)
        else:
            print(df.isnull().sum()[i], '%, percentage of missing values of', i ,'less than limit', limit, '%, so we will keep it.')
    print('New shape after missing value control:', df.shape)

###############################################################################

    FINAL_CUSTOMER_DATATHON.csv
    Index(['unnamed:_0', 'base_customer_id', 'customer_id', 'gender', 'gender_id',
           'marital_status', 'marital_status_id', 'birth_date',
           'fk_address_communication_city', 'occupation'], dtype='object')    

    FINAL_CUSTOMER_RELATED_TABLE_FOR_DATATHON.csv
    Index(['unnamed:_0', 'base_customer_id', 'vehicle_id', 'start_date',
           'end_date', 'fk_relation_status_id', 'fk_relation_status_explanation'], dtype='object')
          
    FINAL_SALES_FILE_DATATHON.csv
    Index(['unnamed:_0', 'customer_id', 'salesfile_id', 'sf_create_date', 'status',
           'req_brand_code', 'req_topmodel_code'], dtype='object')
          
    FINAL_SIFIR_ARAC_ALANLAR_DATATHON.csv
    Index(['unnamed:_0', 'vehicle_id', 'traffic_date', 'brand_code',
           'basemodel_code', 'topmodel_code', 'motor_gas_type', 'gear_box_type'], dtype='object')
          
    FINAL_VEHICLE_TABLE_DATATHON.csv
    Index(['unnamed:_0', 'vehicle_id', 'customer_id', 'create_date'], dtype='object') 
    
    MASK_SERVIS_BAKIM_DATATHON_FINAL.csv
    Index(['unnamed:_0', 'create_date', 'is_maintenance', 'vehicle_id',
           'total_amount_tl'], dtype='object')
          
    sample_submission.csv
    Index(['id', 'expected'], dtype='object')    

In [259]:
dataset_path1 = "D_FINAL_CUSTOMER_DATATHON.csv"
df1_ = pd.read_csv(dataset_path1)
dataset_path2 = "D_FINAL_CUSTOMER_RELATED_TABLE_FOR_DATATHON.csv"
df2_ = pd.read_csv(dataset_path2)
dataset_path3 = "D_FINAL_SALES_FILE_DATATHON.csv"
df3_ = pd.read_csv(dataset_path3)
dataset_path4 = "D_FINAL_SIFIR_ARAC_ALANLAR_DATATHON.csv"
df4_ = pd.read_csv(dataset_path4)
dataset_path5 = "D_FINAL_VEHICLE_TABLE_DATATHON.csv"
df5_ = pd.read_csv(dataset_path5)
dataset_path6 = "D_MASK_SERVIS_BAKIM_DATATHON_FINAL.csv"
df6_ = pd.read_csv(dataset_path6)
dataset_path7 = "D_sample_submission.csv"
df7_ = pd.read_csv(dataset_path7)

In [19]:
# # lets check the Summary Statistics for each of the Crops

# @interact
# def summary(df_list):
#     first_looking(df)
#     duplicate_values(df)
#     drop_columns(df,[])
#     drop_null(df, 90)

In [20]:
# 2-Load|Read Data
dataset_path = "D_FINAL_CUSTOMER_DATATHON.csv"
df0 = pd.read_csv(dataset_path)
df = df0.copy() 
first_looking(df)
duplicate_values(df)
drop_columns(df,[])
drop_null(df, 90)
# df.head()
# df.tail()
# df.sample(5)
# df.describe().T
# df.describe(include=object).T

[1mShape:[0m(849919, 10)
There is 849919 observation and 10 columns in the dataset.
[1m[31m-------------------------------------------------------------------------------[0m[1m
Info:
[0m
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 849919 entries, 0 to 849918
Data columns (total 10 columns):
 #   Column                         Non-Null Count   Dtype  
---  ------                         --------------   -----  
 0   Unnamed: 0                     849919 non-null  int64  
 1   BASE_CUSTOMER_ID               849919 non-null  int64  
 2   CUSTOMER_ID                    849919 non-null  int64  
 3   GENDER                         849916 non-null  object 
 4   GENDER_ID                      849919 non-null  int64  
 5   MARITAL_STATUS                 620802 non-null  object 
 6   MARITAL_STATUS_ID              849919 non-null  int64  
 7   BIRTH_DATE                     645447 non-null  float64
 8   FK_ADDRESS_COMMUNICATION_CITY  730341 non-null  object 
 9   OCCUPATION         

In [21]:
# 2-Load|Read Data
dataset_path = "D_FINAL_CUSTOMER_RELATED_TABLE_FOR_DATATHON.csv"
df0 = pd.read_csv(dataset_path)
df = df0.copy() 
first_looking(df)
duplicate_values(df)
drop_columns(df,[])
drop_null(df, 90)
# df.head()
# df.tail()
# df.sample(5)
# df.describe().T
# df.describe(include=object).T

[1mShape:[0m(374879, 7)
There is 374879 observation and 7 columns in the dataset.
[1m[31m-------------------------------------------------------------------------------[0m[1m
Info:
[0m
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 374879 entries, 0 to 374878
Data columns (total 7 columns):
 #   Column                          Non-Null Count   Dtype 
---  ------                          --------------   ----- 
 0   Unnamed: 0                      374879 non-null  int64 
 1   BASE_CUSTOMER_ID                374879 non-null  int64 
 2   VEHICLE_ID                      374879 non-null  int64 
 3   START_DATE                      374879 non-null  object
 4   END_DATE                        159757 non-null  object
 5   FK_RELATION_STATUS_ID           374879 non-null  int64 
 6   FK_RELATION_STATUS_EXPLANATION  374879 non-null  object
dtypes: int64(4), object(3)
memory usage: 20.0+ MB
None
[1m[31m-------------------------------------------------------------------------------[0m


In [22]:
# 2-Load|Read Data
dataset_path = "D_FINAL_SALES_FILE_DATATHON.csv"
df0 = pd.read_csv(dataset_path)
df = df0.copy() 
first_looking(df)
duplicate_values(df)
drop_columns(df,[])
drop_null(df, 90)
# df.head()
# df.tail()
# df.sample(5)
# df.describe().T
# df.describe(include=object).T

[1mShape:[0m(397526, 7)
There is 397526 observation and 7 columns in the dataset.
[1m[31m-------------------------------------------------------------------------------[0m[1m
Info:
[0m
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 397526 entries, 0 to 397525
Data columns (total 7 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Unnamed: 0         397526 non-null  int64  
 1   CUSTOMER_ID        397526 non-null  int64  
 2   SALESFILE_ID       397526 non-null  float64
 3   SF_CREATE_DATE     397526 non-null  object 
 4   STATUS             397526 non-null  int64  
 5   REQ_BRAND_CODE     397526 non-null  object 
 6   REQ_TOPMODEL_CODE  397237 non-null  float64
dtypes: float64(2), int64(3), object(2)
memory usage: 21.2+ MB
None
[1m[31m-------------------------------------------------------------------------------[0m
[1mNumber of Uniques:
[0mUnnamed: 0           397526
CUSTOMER_ID          183523
SALESFILE_ID 

In [23]:
# 2-Load|Read Data
dataset_path = "D_FINAL_SIFIR_ARAC_ALANLAR_DATATHON.csv"
df0 = pd.read_csv(dataset_path)
df = df0.copy() 
first_looking(df)
duplicate_values(df)
drop_columns(df,[])
drop_null(df, 90)
# df.head()
# df.tail()
# df.sample(5)
# df.describe().T
# df.describe(include=object).T

[1mShape:[0m(169525, 4)
There is 169525 observation and 4 columns in the dataset.
[1m[31m-------------------------------------------------------------------------------[0m[1m
Info:
[0m
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 169525 entries, 0 to 169524
Data columns (total 4 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   Unnamed: 0   169525 non-null  int64 
 1   VEHICLE_ID   169525 non-null  int64 
 2   CUSTOMER_ID  169525 non-null  int64 
 3   CREATE_DATE  169525 non-null  object
dtypes: int64(3), object(1)
memory usage: 5.2+ MB
None
[1m[31m-------------------------------------------------------------------------------[0m
[1mNumber of Uniques:
[0mUnnamed: 0     169525
VEHICLE_ID     169521
CUSTOMER_ID    138432
CREATE_DATE    151242
dtype: int64
[1m[31m-------------------------------------------------------------------------------[0m
[1mMissing Values:
[0mEmpty DataFrame
Columns: [Missing_Number, Missing_Per

In [24]:
# 2-Load|Read Data
dataset_path = "D_FINAL_VEHICLE_TABLE_DATATHON.csv"
df0 = pd.read_csv(dataset_path)
df = df0.copy() 
first_looking(df)
duplicate_values(df)
drop_columns(df,[])
drop_null(df, 90)
# df.head()
# df.tail()
# df.sample(5)
# df.describe().T
# df.describe(include=object).T

[1mShape:[0m(345092, 8)
There is 345092 observation and 8 columns in the dataset.
[1m[31m-------------------------------------------------------------------------------[0m[1m
Info:
[0m
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 345092 entries, 0 to 345091
Data columns (total 8 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   Unnamed: 0      345092 non-null  int64 
 1   VEHICLE_ID      345092 non-null  int64 
 2   TRAFFIC_DATE    344226 non-null  object
 3   BRAND_CODE      345092 non-null  object
 4   BASEMODEL_CODE  345092 non-null  int64 
 5   TOPMODEL_CODE   345092 non-null  int64 
 6   MOTOR_GAS_TYPE  272646 non-null  object
 7   GEAR_BOX_TYPE   261077 non-null  object
dtypes: int64(4), object(4)
memory usage: 21.1+ MB
None
[1m[31m-------------------------------------------------------------------------------[0m
[1mNumber of Uniques:
[0mUnnamed: 0        345092
VEHICLE_ID        345092
TRAFFIC_DATE        74

In [25]:
# 2-Load|Read Data
dataset_path = "D_MASK_SERVIS_BAKIM_DATATHON_FINAL.csv"
df0 = pd.read_csv(dataset_path)
df = df0.copy() 
first_looking(df)
duplicate_values(df)
drop_columns(df,[])
drop_null(df, 90)
# df.head()
# df.tail()
# df.sample(5)
# df.describe().T
# df.describe(include=object).T

[1mShape:[0m(1041000, 5)
There is 1041000 observation and 5 columns in the dataset.
[1m[31m-------------------------------------------------------------------------------[0m[1m
Info:
[0m
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1041000 entries, 0 to 1040999
Data columns (total 5 columns):
 #   Column           Non-Null Count    Dtype  
---  ------           --------------    -----  
 0   Unnamed: 0       1041000 non-null  int64  
 1   CREATE_DATE      1041000 non-null  object 
 2   IS_MAINTENANCE   1041000 non-null  int64  
 3   VEHICLE_ID       1041000 non-null  int64  
 4   TOTAL_AMOUNT_TL  965596 non-null   float64
dtypes: float64(1), int64(3), object(1)
memory usage: 39.7+ MB
None
[1m[31m-------------------------------------------------------------------------------[0m
[1mNumber of Uniques:
[0mUnnamed: 0         1041000
CREATE_DATE        1033883
IS_MAINTENANCE           2
VEHICLE_ID          225447
TOTAL_AMOUNT_TL      16006
dtype: int64
[1m[31m-------------

In [26]:
# 2-Load|Read Data
dataset_path = "D_sample_submission.csv"
df0 = pd.read_csv(dataset_path)
df = df0.copy() 
first_looking(df)
duplicate_values(df)
drop_columns(df,[])
drop_null(df, 90)
# df.head()
# df.tail()
# df.sample(5)
# df.describe().T
# df.describe(include=object).T

[1mShape:[0m(166943, 2)
There is 166943 observation and 2 columns in the dataset.
[1m[31m-------------------------------------------------------------------------------[0m[1m
Info:
[0m
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 166943 entries, 0 to 166942
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   Id        166943 non-null  int64  
 1   Expected  0 non-null       float64
dtypes: float64(1), int64(1)
memory usage: 2.5 MB
None
[1m[31m-------------------------------------------------------------------------------[0m
[1mNumber of Uniques:
[0mId          166943
Expected         0
dtype: int64
[1m[31m-------------------------------------------------------------------------------[0m
[1mMissing Values:
[0m          Missing_Number  Missing_Percent
Expected          166943            1.000
[1m[31m-------------------------------------------------------------------------------[0m
[1mAll Columns:[

In [27]:
print(df1_.columns)
print(df2_.columns)
print(df3_.columns)
print(df4_.columns)
print(df5_.columns)
print(df6_.columns)
print(df7_.columns)

Index(['Unnamed: 0', 'BASE_CUSTOMER_ID', 'CUSTOMER_ID', 'GENDER', 'GENDER_ID',
       'MARITAL_STATUS', 'MARITAL_STATUS_ID', 'BIRTH_DATE',
       'FK_ADDRESS_COMMUNICATION_CITY', 'OCCUPATION'],
      dtype='object')
Index(['Unnamed: 0', 'BASE_CUSTOMER_ID', 'VEHICLE_ID', 'START_DATE',
       'END_DATE', 'FK_RELATION_STATUS_ID', 'FK_RELATION_STATUS_EXPLANATION'],
      dtype='object')
Index(['Unnamed: 0', 'CUSTOMER_ID', 'SALESFILE_ID', 'SF_CREATE_DATE', 'STATUS',
       'REQ_BRAND_CODE', 'REQ_TOPMODEL_CODE'],
      dtype='object')
Index(['Unnamed: 0', 'VEHICLE_ID', 'CUSTOMER_ID', 'CREATE_DATE'], dtype='object')
Index(['Unnamed: 0', 'VEHICLE_ID', 'TRAFFIC_DATE', 'BRAND_CODE',
       'BASEMODEL_CODE', 'TOPMODEL_CODE', 'MOTOR_GAS_TYPE', 'GEAR_BOX_TYPE'],
      dtype='object')
Index(['Unnamed: 0', 'CREATE_DATE', 'IS_MAINTENANCE', 'VEHICLE_ID',
       'TOTAL_AMOUNT_TL'],
      dtype='object')
Index(['Id', 'Expected'], dtype='object')


In [28]:
print(df1_["Unnamed: 0"])
print(df1_.shape)
print(df2_["Unnamed: 0"])
print(df2_.shape)
print(df3_["Unnamed: 0"])
print(df3_.shape)
print(df4_["Unnamed: 0"])
print(df4_.shape)
print(df5_["Unnamed: 0"])
print(df5_.shape)
print(df6_["Unnamed: 0"])
print(df6_.shape)
print(df7_.shape)

0              0
1              1
2              2
3              3
4              4
           ...  
849914    849914
849915    849915
849916    849916
849917    849917
849918    849918
Name: Unnamed: 0, Length: 849919, dtype: int64
(849919, 10)
0              0
1              1
2              2
3              3
4              4
           ...  
374874    374874
374875    374875
374876    374876
374877    374877
374878    374878
Name: Unnamed: 0, Length: 374879, dtype: int64
(374879, 7)
0              0
1              1
2              2
3              3
4              4
           ...  
397521    397521
397522    397522
397523    397523
397524    397524
397525    397525
Name: Unnamed: 0, Length: 397526, dtype: int64
(397526, 7)
0              0
1              1
2              2
3              3
4              4
           ...  
169520    169520
169521    169521
169522    169522
169523    169523
169524    169524
Name: Unnamed: 0, Length: 169525, dtype: int64
(169525, 4)
0              

In [29]:
# drop_columns(df1_, 'Unnamed: 0')
# drop_columns(df2_, 'Unnamed: 0')
# drop_columns(df3_, 'Unnamed: 0')
# drop_columns(df4_, 'Unnamed: 0')
# drop_columns(df5_, 'Unnamed: 0')
# drop_columns(df6_, 'Unnamed: 0')

In [30]:
df1_.columns= df1_.columns.str.lower().str.replace('&', '_').str.replace(' ', '_')
df2_.columns= df2_.columns.str.lower().str.replace('&', '_').str.replace(' ', '_')
df3_.columns= df3_.columns.str.lower().str.replace('&', '_').str.replace(' ', '_')
df4_.columns= df4_.columns.str.lower().str.replace('&', '_').str.replace(' ', '_')
df5_.columns= df5_.columns.str.lower().str.replace('&', '_').str.replace(' ', '_')
df6_.columns= df6_.columns.str.lower().str.replace('&', '_').str.replace(' ', '_')
df7_.columns= df7_.columns.str.lower().str.replace('&', '_').str.replace(' ', '_')

In [31]:
# print(df1_.columns)
# print(df2_.columns)
# print(df3_.columns)
# print(df4_.columns)
# print(df5_.columns)
# print(df6_.columns)
# print(df7_.columns)

In [32]:
df7_.rename({'id': 'base_customer_id'}, axis=1, inplace=True)
df7_

Unnamed: 0,base_customer_id,expected
0,168254,
1,124747,
2,135493,
3,90543,
4,40421,
...,...,...
166938,53945,
166939,98842,
166940,55766,
166941,22680,


In [33]:
drop_columns(df7_, "expected")
# expected bizim tahminleri yazacagimiz kolon suan icin dusuyoruz

expected were dropped


In [34]:
df7_.head(2)

Unnamed: 0,base_customer_id
0,168254
1,124747


In [35]:
df1_.head(2)

Unnamed: 0,unnamed:_0,base_customer_id,customer_id,gender,gender_id,marital_status,marital_status_id,birth_date,fk_address_communication_city,occupation
0,0,158891,1747700,Erkek,100,,0,,,
1,1,169233,1710561,Erkek,100,Evli,101,1962.0,Erzurum,Öğretmen / Eğitmen


In [36]:
drop_columns(df1_, "unnamed:_0")
# unnamed:_0 indexlerden olusuyor sadece, dusuyoruz

unnamed:_0 were dropped


In [37]:
df1_

Unnamed: 0,base_customer_id,customer_id,gender,gender_id,marital_status,marital_status_id,birth_date,fk_address_communication_city,occupation
0,158891,1747700,Erkek,100,,0,,,
1,169233,1710561,Erkek,100,Evli,101,1962.000,Erzurum,Öğretmen / Eğitmen
2,30887,1043923,Erkek,100,Evli,101,1971.000,İstanbul,Esnaf
3,38013,1101926,Erkek,100,,0,,Manisa,
4,157091,1819787,Kadın,101,,0,,Ankara,
...,...,...,...,...,...,...,...,...,...
849914,18175,1744051,Erkek,100,,0,,,
849915,108604,1209486,Erkek,100,Evli,101,1969.000,İstanbul,Nakliyeci / Lojistik / Taşımacılık
849916,155926,1670812,Erkek,100,Evli,101,1970.000,İstanbul,
849917,123223,1691689,Erkek,100,Evli,101,1970.000,Kayseri,


In [38]:
df1_[["gender", "gender_id", 
     "marital_status","marital_status_id" ]].value_counts()

gender  gender_id  marital_status  marital_status_id
Erkek   100        Evli            101                  447463
                   Bekar           102                   77143
Kadın   101        Evli            101                   65748
                   Bekar           102                   30448
dtype: int64

In [39]:
df1_["gender_id"].value_counts(dropna=False)

100    731411
101    118505
0           3
Name: gender_id, dtype: int64

In [40]:
df1_[df1_["gender_id"]==0]
# gender ID 0 olanlarin ikisi ayni kisi(158309) ve gender bilgisi girilmemis

Unnamed: 0,base_customer_id,customer_id,gender,gender_id,marital_status,marital_status_id,birth_date,fk_address_communication_city,occupation
54223,138147,1808549,,0,,0,,,
368071,158309,1468162,,0,,0,,Giresun,
689191,158309,1468161,,0,,0,,Giresun,


In [41]:
df1_[df1_["base_customer_id"]==158309]
# bu kisinin cinsiyeti henuz tespit edilemedigi icin nan kalacak

Unnamed: 0,base_customer_id,customer_id,gender,gender_id,marital_status,marital_status_id,birth_date,fk_address_communication_city,occupation
368071,158309,1468162,,0,,0,,Giresun,
689191,158309,1468161,,0,,0,,Giresun,


In [42]:
# df1_[df1_["base_customer_id"]==138147]
# # 138147a gender girilmemis ancak erkek oldugu goruldu 

In [43]:
df1_.loc[(df1_["base_customer_id"]==138147), "gender"]="Erkek"
# 138147a gender girilmemis  erkek oldugu goruldu erkek olarak atandi

In [44]:
df1_[df1_["base_customer_id"]==138147]

Unnamed: 0,base_customer_id,customer_id,gender,gender_id,marital_status,marital_status_id,birth_date,fk_address_communication_city,occupation
54223,138147,1808549,Erkek,0,,0,,,
125775,138147,1823829,Erkek,100,Evli,101,1981.0,Isparta,Serbest
312332,138147,1448732,Erkek,100,Evli,101,1981.0,Isparta,Esnaf
325514,138147,1822401,Erkek,100,Evli,101,1981.0,Isparta,Serbest
370518,138147,1822818,Erkek,100,Evli,101,1981.0,Isparta,Esnaf
485788,138147,1392445,Erkek,100,Evli,101,1981.0,Burdur,
641593,138147,1801401,Erkek,100,,0,,Isparta,
690934,138147,1392446,Erkek,100,Evli,101,1981.0,Isparta,Serbest
845285,138147,1747889,Erkek,100,Evli,101,1981.0,Isparta,Oto. Bayisi / Oto. Galerisi


In [45]:
df1_.loc[(df1_["base_customer_id"]==138147), "gender_id"]=100
# 138147a gender girilmemis  erkek oldugu goruldu erkek olarak atandi

In [46]:
df1_[df1_["base_customer_id"]==138147]

Unnamed: 0,base_customer_id,customer_id,gender,gender_id,marital_status,marital_status_id,birth_date,fk_address_communication_city,occupation
54223,138147,1808549,Erkek,100,,0,,,
125775,138147,1823829,Erkek,100,Evli,101,1981.0,Isparta,Serbest
312332,138147,1448732,Erkek,100,Evli,101,1981.0,Isparta,Esnaf
325514,138147,1822401,Erkek,100,Evli,101,1981.0,Isparta,Serbest
370518,138147,1822818,Erkek,100,Evli,101,1981.0,Isparta,Esnaf
485788,138147,1392445,Erkek,100,Evli,101,1981.0,Burdur,
641593,138147,1801401,Erkek,100,,0,,Isparta,
690934,138147,1392446,Erkek,100,Evli,101,1981.0,Isparta,Serbest
845285,138147,1747889,Erkek,100,Evli,101,1981.0,Isparta,Oto. Bayisi / Oto. Galerisi


In [47]:
df1_.shape

(849919, 9)

In [48]:
# ((df1_["gender_id"]==100)&(df1_["gender"]=="Erkek")).value_counts()

In [49]:
## ((df1_["gender_id"]==101)&(df1_["gender"]=="Kadın")).value_counts()
# 101 olanlarin kadin oldugu teyit edildi
# 100 olanlarin erkek oldugu teyit edildi
# dolayisi ile gender idyi dusuyoruz

In [50]:
drop_columns(df1_, "gender_id")
# gender IDyi dustuk ancak genderda ik adet NAN kaldi simdilik

gender_id were dropped


In [51]:
df1_.head(2)

Unnamed: 0,base_customer_id,customer_id,gender,marital_status,marital_status_id,birth_date,fk_address_communication_city,occupation
0,158891,1747700,Erkek,,0,,,
1,169233,1710561,Erkek,Evli,101,1962.0,Erzurum,Öğretmen / Eğitmen


In [52]:
df1_["marital_status_id"].value_counts(dropna=False)

101    513211
0      229117
102    107591
Name: marital_status_id, dtype: int64

In [53]:
df1_["marital_status"].value_counts(dropna=False)

Evli     513211
NaN      229117
Bekar    107591
Name: marital_status, dtype: int64

In [54]:
((df1_["marital_status_id"]==101)&(df1_["marital_status"]=="Evli")).value_counts()

True     513211
False    336708
dtype: int64

In [55]:
((df1_["marital_status_id"]==102)&(df1_["marital_status"]=="Bekar")).value_counts()
# 101 olanlarin Evli oldugu teyit edildi
# 102 olanlarin Bekar oldugu teyit edildi
# 0 olanlarin NaN oldugu teyit edildi
# dolayisi ile marital_status_idyi dusuyoruz

False    742328
True     107591
dtype: int64

In [56]:
drop_columns(df1_, "marital_status_id")
# marital_status_idyi dustuk ancak genderda ik adet NAN kaldi simdilik

marital_status_id were dropped


In [57]:
df1_.head(2)

Unnamed: 0,base_customer_id,customer_id,gender,marital_status,birth_date,fk_address_communication_city,occupation
0,158891,1747700,Erkek,,,,
1,169233,1710561,Erkek,Evli,1962.0,Erzurum,Öğretmen / Eğitmen


In [58]:
df1_.shape

(849919, 7)

In [59]:
df7_.head(2)

Unnamed: 0,base_customer_id
0,168254
1,124747


In [60]:
df7_.shape

(166943, 1)

In [61]:
df_ = df7_.merge(df1_, on="base_customer_id", how='left')

df_.head(2)

Unnamed: 0,base_customer_id,customer_id,gender,marital_status,birth_date,fk_address_communication_city,occupation
0,168254,1507435,Erkek,Bekar,1988.0,Şanlıurfa,
1,168254,1663507,Erkek,Bekar,1988.0,Şanlıurfa,


In [62]:
df_.shape

(714249, 7)

In [63]:
849919-714249
# df1deki 135670 adet veri girisi bizim sample_submission yapacagimiz kisilere ait degil

135670

In [64]:
df2_["fk_relation_status_explanation"].value_counts()

Aktif Ruhsat Sahibi    215122
Pasif Ruhsat Sahibi    159757
Name: fk_relation_status_explanation, dtype: int64

In [65]:
df2__ = df2_[df2_["fk_relation_status_explanation"]=="Aktif Ruhsat Sahibi"]["base_customer_id"].unique()

In [66]:
len(df2__)

166943

In [67]:
np.sort(df2__)

array([     1,      2,      3, ..., 196559, 196560, 196563], dtype=int64)

In [68]:
df7_["base_customer_id"].sort_values().head(3)

40570     1
131267    2
68238     3
Name: base_customer_id, dtype: int64

In [69]:
df7_["base_customer_id"].sort_values().tail(3)

141974    196559
92600     196560
148887    196563
Name: base_customer_id, dtype: int64

In [70]:
len(df7_)

166943

In [71]:
(np.sort(df2__)==df7_["base_customer_id"].sort_values()).value_counts()
# df7_ ile df2_ deki BCIDler birebir ayni, sorun yok; df2_'de 166943 unique BCID ve 
# bu kisilere ait toplam 215122 adet aktif ruhsatli arac ve bu araclarin VehicleIDleri var, 
# ayrica yine bu kisilere ait 309000-215122 adet Pasif Ruhsatli arac ve bu araclarin VehicleIDleri var.

True    166943
Name: base_customer_id, dtype: int64

In [72]:
df2_[df2_["fk_relation_status_explanation"]=="Aktif Ruhsat Sahibi"][["base_customer_id","vehicle_id"]]
# df2_ 166943 unique BCID ve 
# bu kisilere ait toplam 215122 adet aktif ruhsatli arac ve VehicleIDleri var 
# en eonemli kitle hem kapali hem acik ruhsatli arac bilgisi olan bu kisiler olabilir

Unnamed: 0,base_customer_id,vehicle_id
1,110802,5329278
2,160615,5329282
3,115664,5328513
4,81061,5322264
5,127152,5322205
...,...,...
374870,168053,5179442
374874,154304,5179455
374875,75199,5180458
374876,9036,5177679


In [73]:
df2_["fk_relation_status_explanation"].value_counts(dropna=False)

Aktif Ruhsat Sahibi    215122
Pasif Ruhsat Sahibi    159757
Name: fk_relation_status_explanation, dtype: int64

In [74]:
df2_.shape

(374879, 7)

In [75]:
df2_["base_customer_id"].nunique()

196538

In [76]:
# df2_.groupby("base_customer_id")["fk_relation_status_explanation"].describe(include=object)

In [77]:
df2_[df2_["fk_relation_status_explanation"]=="Aktif Ruhsat Sahibi"]

Unnamed: 0,unnamed:_0,base_customer_id,vehicle_id,start_date,end_date,fk_relation_status_id,fk_relation_status_explanation
1,1,110802,5329278,2019-07-16,,1,Aktif Ruhsat Sahibi
2,2,160615,5329282,2019-09-12,,1,Aktif Ruhsat Sahibi
3,3,115664,5328513,2019-02-25,,1,Aktif Ruhsat Sahibi
4,4,81061,5322264,2018-04-16,,1,Aktif Ruhsat Sahibi
5,5,127152,5322205,2021-04-12,,1,Aktif Ruhsat Sahibi
...,...,...,...,...,...,...,...
374870,374870,168053,5179442,2013-06-25,,1,Aktif Ruhsat Sahibi
374874,374874,154304,5179455,2010-10-19,,1,Aktif Ruhsat Sahibi
374875,374875,75199,5180458,2019-01-02,,1,Aktif Ruhsat Sahibi
374876,374876,9036,5177679,2012-05-28,,1,Aktif Ruhsat Sahibi


In [78]:
df2_[df2_["fk_relation_status_explanation"]=="Pasif Ruhsat Sahibi"]

Unnamed: 0,unnamed:_0,base_customer_id,vehicle_id,start_date,end_date,fk_relation_status_id,fk_relation_status_explanation
0,0,189537,5321560,2018-07-24,2019-07-03,2,Pasif Ruhsat Sahibi
6,6,153835,5322205,2018-06-05,2019-06-21,2,Pasif Ruhsat Sahibi
10,10,164581,5321370,2020-04-29,2020-08-07,2,Pasif Ruhsat Sahibi
11,11,111788,5321488,2018-02-07,2019-02-26,2,Pasif Ruhsat Sahibi
16,16,154311,5326120,2018-10-31,2019-12-11,2,Pasif Ruhsat Sahibi
...,...,...,...,...,...,...,...
374869,374869,168053,5179442,2010-09-14,2011-11-24,2,Pasif Ruhsat Sahibi
374871,374871,93551,5188087,2019-06-26,2019-06-29,2,Pasif Ruhsat Sahibi
374872,374872,157467,5188087,2013-07-22,2014-06-12,2,Pasif Ruhsat Sahibi
374873,374873,133546,5178752,2014-01-31,2014-03-19,2,Pasif Ruhsat Sahibi


In [79]:
df2_aktifID = df2_[df2_["fk_relation_status_explanation"]=="Aktif Ruhsat Sahibi"]["base_customer_id"]
df2_aktifID

1         110802
2         160615
3         115664
4          81061
5         127152
           ...  
374870    168053
374874    154304
374875     75199
374876      9036
374877     33936
Name: base_customer_id, Length: 215122, dtype: int64

In [80]:
df2_pasifID = df2_[df2_["fk_relation_status_explanation"]=="Pasif Ruhsat Sahibi"]["base_customer_id"]
df2_pasifID 

0         189537
6         153835
10        164581
11        111788
16        154311
           ...  
374869    168053
374871     93551
374872    157467
374873    133546
374878     91144
Name: base_customer_id, Length: 159757, dtype: int64

In [81]:
df_diff = pd.concat([df2_aktifID , df2_pasifID ]).drop_duplicates(keep=False)
df_diff

1         110802
5         127152
8          59160
9         159897
15        177011
           ...  
374781     78413
374826     61332
374838    119487
374866    163801
374871     93551
Name: base_customer_id, Length: 106685, dtype: int64

In [82]:
df2_["fk_relation_status_explanation"].value_counts(dropna=False)

Aktif Ruhsat Sahibi    215122
Pasif Ruhsat Sahibi    159757
Name: fk_relation_status_explanation, dtype: int64

In [83]:
df2_["base_customer_id"].nunique()

196538

In [84]:
df2_.shape

(374879, 7)

In [85]:
df2_target_customer = pd.merge(df2_aktifID, df2_pasifID, how='inner')
df2_target_customer

Unnamed: 0,base_customer_id
0,160615
1,160615
2,160615
3,160615
4,160615
...,...
410450,33936
410451,33936
410452,33936
410453,33936


In [86]:
df2_target_customer.nunique()

base_customer_id    56162
dtype: int64

In [87]:
len(set(df2_target_customer["base_customer_id"]))
# hem pasif hem de aktif araci olan bizim potansiyel arac alacagini dusundugumuz kitlenin base customer idileri

56162

In [88]:
df2_[df2_["base_customer_id"]==185023]

Unnamed: 0,unnamed:_0,base_customer_id,vehicle_id,start_date,end_date,fk_relation_status_id,fk_relation_status_explanation
1174,1174,185023,5251358,2015-08-17,2015-10-19,2,Pasif Ruhsat Sahibi
2009,2009,185023,5181366,2014-11-26,,1,Aktif Ruhsat Sahibi
2230,2230,185023,5073008,2014-05-13,,1,Aktif Ruhsat Sahibi
3076,3076,185023,5186431,2014-03-28,,1,Aktif Ruhsat Sahibi
3443,3443,185023,5071744,2015-01-26,2017-09-22,2,Pasif Ruhsat Sahibi
3763,3763,185023,5037241,2014-11-20,,1,Aktif Ruhsat Sahibi
6248,6248,185023,5037598,2013-06-12,,1,Aktif Ruhsat Sahibi
7685,7685,185023,5210677,2014-09-09,2020-03-23,2,Pasif Ruhsat Sahibi
7809,7809,185023,5173981,2013-05-30,,1,Aktif Ruhsat Sahibi
8659,8659,185023,5088327,2015-12-03,,1,Aktif Ruhsat Sahibi


In [89]:
df2_["fk_relation_status_explanation"].value_counts(dropna=False)

Aktif Ruhsat Sahibi    215122
Pasif Ruhsat Sahibi    159757
Name: fk_relation_status_explanation, dtype: int64

In [90]:
df2_[df2_["fk_relation_status_explanation"]=="Aktif Ruhsat Sahibi"]["base_customer_id"].nunique()

166943

In [91]:
df2_[df2_["fk_relation_status_explanation"]=="Pasif Ruhsat Sahibi"]["base_customer_id"].nunique()

85757

In [92]:
df_.head(2)

Unnamed: 0,base_customer_id,customer_id,gender,marital_status,birth_date,fk_address_communication_city,occupation
0,168254,1507435,Erkek,Bekar,1988.0,Şanlıurfa,
1,168254,1663507,Erkek,Bekar,1988.0,Şanlıurfa,


In [93]:
df2_.head(2)

Unnamed: 0,unnamed:_0,base_customer_id,vehicle_id,start_date,end_date,fk_relation_status_id,fk_relation_status_explanation
0,0,189537,5321560,2018-07-24,2019-07-03,2,Pasif Ruhsat Sahibi
1,1,110802,5329278,2019-07-16,,1,Aktif Ruhsat Sahibi


In [94]:
df2_["fk_relation_status_id"].value_counts(dropna=False)

1    215122
2    159757
Name: fk_relation_status_id, dtype: int64

In [95]:
((df2_["fk_relation_status_id"]==1)&(df2_["fk_relation_status_explanation"]=="Aktif Ruhsat Sahibi")).value_counts()

True     215122
False    159757
dtype: int64

In [96]:
((df2_["fk_relation_status_id"]==2)&(df2_["fk_relation_status_explanation"]=="Pasif Ruhsat Sahibi")).value_counts()
# 1 olanlarin Aktif Ruhsat Sahibi oldugu teyit edildi
# 2 olanlarin Pasif Ruhsat Sahibi oldugu teyit edildi
# dolayisi ile fk_relation_status_idyi dusuyoruz
# unnamed:_0 da dusuyoruz

False    215122
True     159757
dtype: int64

In [97]:
drop_columns(df2_, ["fk_relation_status_id", "unnamed:_0"])

['fk_relation_status_id', 'unnamed:_0'] were dropped


In [98]:
df2_.head(2)

Unnamed: 0,base_customer_id,vehicle_id,start_date,end_date,fk_relation_status_explanation
0,189537,5321560,2018-07-24,2019-07-03,Pasif Ruhsat Sahibi
1,110802,5329278,2019-07-16,,Aktif Ruhsat Sahibi


In [99]:
df_.shape
# df7_ ve df1_ join edilmisti suana kadar

(714249, 7)

In [100]:
df2_.shape

(374879, 5)

In [101]:
df_.head(2)

Unnamed: 0,base_customer_id,customer_id,gender,marital_status,birth_date,fk_address_communication_city,occupation
0,168254,1507435,Erkek,Bekar,1988.0,Şanlıurfa,
1,168254,1663507,Erkek,Bekar,1988.0,Şanlıurfa,


In [102]:
df_ = df_.merge(df2_, on="base_customer_id", how='left')
df_.head(2)

Unnamed: 0,base_customer_id,customer_id,gender,marital_status,birth_date,fk_address_communication_city,occupation,vehicle_id,start_date,end_date,fk_relation_status_explanation
0,168254,1507435,Erkek,Bekar,1988.0,Şanlıurfa,,5329232,2019-09-20,,Aktif Ruhsat Sahibi
1,168254,1507435,Erkek,Bekar,1988.0,Şanlıurfa,,5183621,2015-09-14,2019-07-05,Pasif Ruhsat Sahibi


In [103]:
df_["base_customer_id"].nunique()

166943

In [104]:
df3_.head(2)

Unnamed: 0,unnamed:_0,customer_id,salesfile_id,sf_create_date,status,req_brand_code,req_topmodel_code
0,0,1712792,7274467.0,2017-11-17 12:15:39,101,Y,9231.0
1,1,1149729,4839430.0,2015-11-11 17:03:28,102,K,9184.0


In [105]:
df3_.shape

(397526, 7)

In [106]:
df4_.head()

Unnamed: 0,unnamed:_0,vehicle_id,customer_id,create_date
0,0,5015193,1063740,2005-01-01 19:08:33
1,1,5110231,1155678,2005-09-29 17:16:15
2,2,5290356,1737831,2016-04-29 13:12:10
3,3,5344576,1841362,2021-03-16 13:39:01
4,4,5094010,1066916,2005-04-20 09:51:19


In [107]:
df4_.shape

(169525, 4)

In [108]:
df5_.head()

Unnamed: 0,unnamed:_0,vehicle_id,traffic_date,brand_code,basemodel_code,topmodel_code,motor_gas_type,gear_box_type
0,0,5317713,2017-12-07,Z,13398,9349,Benzin,Manuel
1,1,5259212,2014-12-09,X,15694,9524,Dizel,Otomatik (DSG)
2,2,5110414,2005-12-12,Y,11183,9118,Kurşunsuz Benzin,Otomatik
3,3,5101263,2005-08-23,X,15039,9495,Benzin,Manuel
4,4,5053909,2003-04-09,Y,12255,9219,,


In [109]:
df5_.shape

(345092, 8)

In [110]:
df6_.head()

Unnamed: 0,unnamed:_0,create_date,is_maintenance,vehicle_id,total_amount_tl
0,0,2010-12-29 10:50:56,1,5000001,192.0
1,1,2013-08-21 10:34:21,1,5000001,267.0
2,2,2015-11-30 10:55:00,1,5000001,1747.0
3,3,2015-12-09 13:38:49,0,5000001,465.0
4,4,2017-11-23 09:53:22,1,5000001,549.0


In [111]:
df6_.shape

(1041000, 5)

In [112]:
df6_vehicle = df6_["vehicle_id"]

In [113]:
df4_vehicle = df4_["vehicle_id"]

In [114]:
df_diff = pd.concat([df2_vehicle, df4_vehicle]).drop_duplicates(keep=False)
df_diff

NameError: name 'df2_vehicle' is not defined

In [None]:
df_diff = pd.concat([df2_vehicle, df5_vehicle]).drop_duplicates(keep=False)
df_diff

In [None]:
df5_vehicle = df5_["vehicle_id"]

In [115]:
df2_vehicle.shape

NameError: name 'df2_vehicle' is not defined

In [116]:
df6_vehicle.shape

(1041000,)

In [117]:
df6_vehicle.nunique()

225447

In [118]:
df2_vehicle.nunique()

NameError: name 'df2_vehicle' is not defined

In [119]:
df_diff = pd.concat([df2_vehicle, df6_vehicle]).drop_duplicates(keep=False)

NameError: name 'df2_vehicle' is not defined

In [120]:
df_diff

1         110802
5         127152
8          59160
9         159897
15        177011
           ...  
374781     78413
374826     61332
374838    119487
374866    163801
374871     93551
Name: base_customer_id, Length: 106685, dtype: int64

In [121]:
df2_

Unnamed: 0,base_customer_id,vehicle_id,start_date,end_date,fk_relation_status_explanation
0,189537,5321560,2018-07-24,2019-07-03,Pasif Ruhsat Sahibi
1,110802,5329278,2019-07-16,,Aktif Ruhsat Sahibi
2,160615,5329282,2019-09-12,,Aktif Ruhsat Sahibi
3,115664,5328513,2019-02-25,,Aktif Ruhsat Sahibi
4,81061,5322264,2018-04-16,,Aktif Ruhsat Sahibi
...,...,...,...,...,...
374874,154304,5179455,2010-10-19,,Aktif Ruhsat Sahibi
374875,75199,5180458,2019-01-02,,Aktif Ruhsat Sahibi
374876,9036,5177679,2012-05-28,,Aktif Ruhsat Sahibi
374877,33936,5172864,2014-08-16,,Aktif Ruhsat Sahibi


In [122]:
df6_.shape

(1041000, 5)

In [123]:
df_vehicle=df_["vehicle_id"]

In [124]:
df_diff = pd.concat([df_vehicle, df6_vehicle]).drop_duplicates(keep=False)
df_diff

9619       5018539
10335      5004562
10355      5005050
11569      5018325
14039      5111440
            ...   
1040162    5341922
1040278    5342204
1040389    5342626
1040508    5342903
1040971    5344784
Name: vehicle_id, Length: 9808, dtype: int64

In [125]:
df_ = df_.merge(df2_, on = "base_customer_id", how='left')
df_["base_customer_id"].nunique()
# df7_+df_1(BaseCustID)+df3_(CustomerID)

166943

In [126]:
df2_aktif = df2_[df2_["fk_relation_status_explanation"]=="Aktif Ruhsat Sahibi"]["base_customer_id"]
df2_aktif

1         110802
2         160615
3         115664
4          81061
5         127152
           ...  
374870    168053
374874    154304
374875     75199
374876      9036
374877     33936
Name: base_customer_id, Length: 215122, dtype: int64

In [127]:
df2_pasif = df2_[df2_["fk_relation_status_explanation"]=="Pasif Ruhsat Sahibi"]["base_customer_id"]
df2_pasif 

0         189537
6         153835
10        164581
11        111788
16        154311
           ...  
374869    168053
374871     93551
374872    157467
374873    133546
374878     91144
Name: base_customer_id, Length: 159757, dtype: int64

In [128]:
df2_.shape

(374879, 5)

In [129]:
df_diff = pd.concat([df2_pasif, df2_aktif]).drop_duplicates(keep=False)
df_diff

131        55526
175       194513
183       183147
188       192492
196       180052
           ...  
374834    104036
374842    155760
374858     22680
374863    124954
374865    167802
Name: base_customer_id, Length: 106685, dtype: int64

In [130]:
159757-106685

53072

In [131]:
df_.shape

(28440948, 15)

In [132]:
df_.head()

Unnamed: 0,base_customer_id,customer_id,gender,marital_status,birth_date,fk_address_communication_city,occupation,vehicle_id_x,start_date_x,end_date_x,fk_relation_status_explanation_x,vehicle_id_y,start_date_y,end_date_y,fk_relation_status_explanation_y
0,168254,1507435,Erkek,Bekar,1988.0,Şanlıurfa,,5329232,2019-09-20,,Aktif Ruhsat Sahibi,5329232,2019-09-20,,Aktif Ruhsat Sahibi
1,168254,1507435,Erkek,Bekar,1988.0,Şanlıurfa,,5329232,2019-09-20,,Aktif Ruhsat Sahibi,5183621,2015-09-14,2019-07-05,Pasif Ruhsat Sahibi
2,168254,1507435,Erkek,Bekar,1988.0,Şanlıurfa,,5329232,2019-09-20,,Aktif Ruhsat Sahibi,5208488,2012-09-15,2014-02-19,Pasif Ruhsat Sahibi
3,168254,1507435,Erkek,Bekar,1988.0,Şanlıurfa,,5329232,2019-09-20,,Aktif Ruhsat Sahibi,5310923,2018-07-19,2019-08-16,Pasif Ruhsat Sahibi
4,168254,1507435,Erkek,Bekar,1988.0,Şanlıurfa,,5329232,2019-09-20,,Aktif Ruhsat Sahibi,5311914,2019-03-29,2021-02-10,Pasif Ruhsat Sahibi


In [133]:
df_ = df_.merge(df6_, on = "vehicle_id", how='left')
df_["base_customer_id"].nunique()
# df7_+df_1(BaseCustID)+df3_(CustomerID)

KeyError: 'vehicle_id'

In [None]:
df_.shape

In [None]:
df_.head()

In [None]:
df_["vehicle_id"].nunique()

In [None]:
df_[(df_["base_customer_id"].notnull())&(df_["vehicle_id"].isnull())]["base_customer_id"]

In [None]:
df_vehicle = df_["vehicle_id"]

In [None]:
df_diff = pd.concat([df_vehicle, df2_vehicle]).drop_duplicates(keep=False)
df_diff

In [None]:
df_[df_["vehicle_id"]==5124447]

In [None]:
df6_[df6_["vehicle_id"]==5124447]

In [None]:
df6_[df6_["vehicle_id"]==5095487]

In [None]:
df6_[df6_["vehicle_id"]==5009220]

In [None]:
df6_[df6_["vehicle_id"]==5251191]

In [None]:
df2_vehicle[df2_vehicle==5251191]

In [None]:
df_[df_["vehicle_id"]==5251191]

In [None]:
df_["fk_relation_status_explanation"].value_counts()

In [None]:
df_vehicle.nunique()

In [None]:
df2_vehicle.nunique()

In [None]:
df2_.info()

In [None]:
len(df_)

In [None]:
df_[["base_customer_id","vehicle_id"]].value_counts()

In [None]:
df2_vehicle

In [None]:
df2_.shape

In [None]:
df4_.head(2)

In [None]:
df4_.shape

In [None]:
df_ = df_.merge(df2_, on = "base_customer_id", how='left')
df_["base_customer_id"].nunique()
# df7_+df_1(BaseCustID)+df3_(CustomerID)

In [None]:
drop_columns(df_, ["expected", "unnamed:_0_x", "unnamed:_0_y"])

In [None]:
df_.head()

In [None]:
df_ = df_.merge(df3_, on = "customer_id", how='left')
df_["base_customer_id"].nunique()

In [None]:
df_.shape

In [None]:
df4_.head()

In [None]:
df_ = df_.merge(df4_, on = "customer_id", how='left')
df_["base_customer_id"].nunique()

In [None]:
df.head(2)

In [None]:
df_.head(2)

In [None]:
df1_.head(2)

In [None]:
df2_.head(2)

In [None]:
df3_.head(2)

In [None]:
df4_.head(2)

In [None]:
df5_.head(2)

In [None]:
df6_.head(2)

In [None]:
df7_.head(2)

In [260]:
df6_.head(1)

Unnamed: 0.1,Unnamed: 0,CREATE_DATE,IS_MAINTENANCE,VEHICLE_ID,TOTAL_AMOUNT_TL
0,0,2010-12-29 10:50:56,1,5000001,192.0


In [261]:
df6_.columns= df6_.columns.str.lower().str.replace('&', '_').str.replace(' ', '_')

In [262]:
df6_.head(1)

Unnamed: 0,unnamed:_0,create_date,is_maintenance,vehicle_id,total_amount_tl
0,0,2010-12-29 10:50:56,1,5000001,192.0


In [263]:
df6_.drop("unnamed:_0", axis=1, inplace=True)

In [264]:
df6_.head(1)

Unnamed: 0,create_date,is_maintenance,vehicle_id,total_amount_tl
0,2010-12-29 10:50:56,1,5000001,192.0


In [266]:
df6_.groupby("vehicle_id")["total_amount_tl"].mean()

vehicle_id
5000001    644.000
5000003    943.000
5000007   1689.000
5000008        NaN
5000013    154.500
            ...   
5344976   1180.000
5344981      0.000
5344999   3258.000
5345037      1.000
5345048    604.000
Name: total_amount_tl, Length: 225447, dtype: float64

In [267]:
aa = pd.DataFrame(df6_.groupby("vehicle_id")["total_amount_tl"].sum())

In [268]:
aa.reset_index(inplace=True)

In [270]:
aa.head(1)

Unnamed: 0,vehicle_id,total_amount_tl
0,5000001,3220.0


In [271]:
aa.columns

Index(['vehicle_id', 'total_amount_tl'], dtype='object')

In [272]:
len(aa)

225447

In [351]:
bb = df6_[["vehicle_id", "is_maintenance"]]

In [352]:
bb["bakım"]=bb["is_maintenance"]

In [353]:
bb.head(1)

Unnamed: 0,vehicle_id,is_maintenance,bakım
0,5000001,1,1


In [354]:
bb.rename({"is_maintenance":"servis"}, axis=1, inplace=True)

In [355]:
bb.head(1)

Unnamed: 0,vehicle_id,servis,bakım
0,5000001,1,1


In [356]:
bb["bakım"] = bb["bakım"].replace(0,2)

In [359]:
bb["bakım"] = bb["bakım"].replace(1,0)

In [361]:
bb["bakım"] = bb["bakım"].replace(2,1)

In [362]:
bb.head(5)

Unnamed: 0,vehicle_id,servis,bakım
0,5000001,1,0
1,5000001,1,0
2,5000001,1,0
3,5000001,0,1
4,5000001,1,0


In [364]:
cc = pd.DataFrame(bb.groupby("vehicle_id")["servis"].sum())

In [368]:
cc.reset_index(inplace=True)

In [369]:
cc.head(1)

Unnamed: 0,vehicle_id,servis
0,5000001,4


In [366]:
dd = pd.DataFrame(bb.groupby("vehicle_id")["bakım"].sum())

In [370]:
dd.reset_index(inplace=True)

In [371]:
dd.head(1)

Unnamed: 0,vehicle_id,bakım
0,5000001,1


In [None]:
df_ = df_.merge(df2_, on = "base_customer_id", how='left')

In [372]:
df6_yeni = aa.merge(cc, on = "vehicle_id", how = "left")

In [373]:
df6_yeni = df6_yeni.merge(dd, on = "vehicle_id", how = "left")

In [374]:
df6_yeni.head()

Unnamed: 0,vehicle_id,total_amount_tl,servis,bakım
0,5000001,3220.0,4,1
1,5000003,943.0,0,1
2,5000007,1689.0,0,1
3,5000008,0.0,0,1
4,5000013,618.0,0,4


In [375]:
df6_yeni.shape

(225447, 4)

In [376]:
df6_yeni["servis"].value_counts()

1     61130
0     55859
2     37061
3     25005
4     15960
5     10649
6      7004
7      4653
8      2809
9      1897
10     1257
11      839
12      450
13      275
14      178
15      120
16       90
17       65
18       46
19       29
20       22
21       15
22        7
25        4
23        3
24        3
28        3
29        3
32        2
33        2
26        2
31        2
30        1
36        1
37        1
Name: servis, dtype: int64

In [377]:
df6_yeni["bakım"].value_counts()

1     71126
0     43420
2     37748
3     23069
4     14985
5      9663
6      6775
7      4873
8      3352
9      2497
10     1832
11     1322
12     1078
13      751
14      565
15      474
16      367
17      294
18      234
19      185
20      134
21      118
22      103
23       85
24       56
25       41
28       37
27       36
26       34
29       25
30       25
32       23
34       17
33       14
35       14
31       10
42        8
39        7
36        7
37        4
38        4
51        4
50        4
47        4
45        3
46        3
43        3
41        3
40        2
52        2
44        1
48        1
54        1
55        1
57        1
63        1
64        1
Name: bakım, dtype: int64