In [1]:
#!pip install pyforest

# 1-Import Libraies
import numpy as np
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats
%matplotlib inline
import statsmodels.api as sm
import statsmodels.formula.api as smf
import missingno as msno 

from sklearn.compose import make_column_transformer

#Scaling
from sklearn.preprocessing import scale 
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures 
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import PowerTransformer 
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler


#Importing plotly and cufflinks in offline mode
import cufflinks as cf
import plotly.offline
cf.go_offline()
cf.set_config_file(offline=False, world_readable=True)

#Ignore Warnings
import warnings
warnings.filterwarnings("ignore")
warnings.warn("this will not show")

#Figure&Display options
plt.rcParams["figure.figsize"] = (10,6)
pd.set_option('max_colwidth',200)
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 200)
pd.set_option('display.float_format', lambda x: '%.3f' % x)

#!pip install termcolor
import colorama
from colorama import Fore, Style  # maakes strings colored
from termcolor import colored

In [2]:
## Useful Functions

###############################################################################

def missing(df):
    missing_number = df.isnull().sum().sort_values(ascending=False)
    missing_percent = (df.isnull().sum()/df.isnull().count()).sort_values(ascending=False)
    missing_values = pd.concat([missing_number, missing_percent], axis=1, keys=['Missing_Number', 'Missing_Percent'])
    return missing_values

def missing_values(df):
    return missing(df)[missing(df)['Missing_Number']>0]

###############################################################################

def first_looking(df):
    print(colored("Shape:", attrs=['bold']), df.shape,'\n',
          f"There is ", df.shape[0], " observation and ", df.shape[1], " columns in the dataset.", '\n',
          colored('-'*79, 'red', attrs=['bold']),
          colored("\nInfo:\n", attrs=['bold']), sep='')
    print(df.info(), '\n', 
          colored('-'*79, 'red', attrs=['bold']), sep='')
    print(colored("Number of Uniques:\n", attrs=['bold']), df.nunique(),'\n',
          colored('-'*79, 'red', attrs=['bold']), sep='')
    print(colored("Missing Values:\n", attrs=['bold']), missing_values(df),'\n', 
          colored('-'*79, 'red', attrs=['bold']), sep='')
    print(colored("All Columns:", attrs=['bold']), list(df.columns),'\n', 
          colored('-'*79, 'red', attrs=['bold']), sep='')

    df.columns= df.columns.str.lower().str.replace('&', '_').str.replace(' ', '_')

    print(colored("Columns after rename:", attrs=['bold']), list(df.columns),'\n',
              colored('-'*79, 'red', attrs=['bold']), sep='')

def duplicate_values(df):
    duplicate_values = df.duplicated(subset=None, keep='first').sum()
    if duplicate_values > 0:
        df.drop_duplicates(keep='first', inplace=True)
        print(duplicate_values, colored("duplicates were dropped", attrs=['bold']),'\n',
              colored('-'*79, 'red', attrs=['bold']), sep='')
    else:
        print(colored("No duplicates", attrs=['bold']),'\n',
              colored('-'*79, 'red', attrs=['bold']), sep='')
        
def drop_columns(df, drop_columns):
    if drop_columns !=[]:
        df.drop(drop_columns, axis=1, inplace=True)
        print(drop_columns, 'were dropped')
    else:
        print(colored('We will now check the missing values and if necessary drop some columns!!!', attrs=['bold']),'\n',
              colored('-'*79, 'red', attrs=['bold']), sep='')
        
def drop_null(df, limit):
    print('Shape:', df.shape)
    for i in df.isnull().sum().index:
        if (df.isnull().sum()[i]/df.shape[0]*100)>limit:
            print(df.isnull().sum()[i], 'percent of', i ,'null and were dropped')
            df.drop(i, axis=1, inplace=True)
            print('new shape:', df.shape)
        else:
            print(df.isnull().sum()[i], '%, percentage of missing values of', i ,'less than limit', limit, '%, so we will keep it.')
    print('New shape after missing value control:', df.shape)

###############################################################################

In [3]:
# 1-Load|Read Data
dataset_path = "D_FINAL_CUSTOMER_DATATHON.csv"
df1 = pd.read_csv(dataset_path)
first_looking(df1)
duplicate_values(df1)
drop_columns(df1,['unnamed:_0', 'gender_id', "marital_status_id"])
drop_null(df1, 90)

[1mShape:[0m(849919, 10)
There is 849919 observation and 10 columns in the dataset.
[1m[31m-------------------------------------------------------------------------------[0m[1m
Info:
[0m
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 849919 entries, 0 to 849918
Data columns (total 10 columns):
 #   Column                         Non-Null Count   Dtype  
---  ------                         --------------   -----  
 0   Unnamed: 0                     849919 non-null  int64  
 1   BASE_CUSTOMER_ID               849919 non-null  int64  
 2   CUSTOMER_ID                    849919 non-null  int64  
 3   GENDER                         849916 non-null  object 
 4   GENDER_ID                      849919 non-null  int64  
 5   MARITAL_STATUS                 620802 non-null  object 
 6   MARITAL_STATUS_ID              849919 non-null  int64  
 7   BIRTH_DATE                     645447 non-null  float64
 8   FK_ADDRESS_COMMUNICATION_CITY  730341 non-null  object 
 9   OCCUPATION         

In [4]:
# 2-Load|Read Data
dataset_path = "D_FINAL_CUSTOMER_RELATED_TABLE_FOR_DATATHON.csv"
df2 = pd.read_csv(dataset_path)
first_looking(df2)
duplicate_values(df2)
drop_columns(df2,['unnamed:_0'])
drop_null(df2, 90)

[1mShape:[0m(374879, 7)
There is 374879 observation and 7 columns in the dataset.
[1m[31m-------------------------------------------------------------------------------[0m[1m
Info:
[0m
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 374879 entries, 0 to 374878
Data columns (total 7 columns):
 #   Column                          Non-Null Count   Dtype 
---  ------                          --------------   ----- 
 0   Unnamed: 0                      374879 non-null  int64 
 1   BASE_CUSTOMER_ID                374879 non-null  int64 
 2   VEHICLE_ID                      374879 non-null  int64 
 3   START_DATE                      374879 non-null  object
 4   END_DATE                        159757 non-null  object
 5   FK_RELATION_STATUS_ID           374879 non-null  int64 
 6   FK_RELATION_STATUS_EXPLANATION  374879 non-null  object
dtypes: int64(4), object(3)
memory usage: 20.0+ MB
None
[1m[31m-------------------------------------------------------------------------------[0m


In [5]:
# 3-Load|Read Data
dataset_path = "D_FINAL_SALES_FILE_DATATHON.csv"
df3 = pd.read_csv(dataset_path)
first_looking(df3)
duplicate_values(df3)
drop_columns(df3,['unnamed:_0'])
drop_null(df3, 90)

[1mShape:[0m(397526, 7)
There is 397526 observation and 7 columns in the dataset.
[1m[31m-------------------------------------------------------------------------------[0m[1m
Info:
[0m
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 397526 entries, 0 to 397525
Data columns (total 7 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Unnamed: 0         397526 non-null  int64  
 1   CUSTOMER_ID        397526 non-null  int64  
 2   SALESFILE_ID       397526 non-null  float64
 3   SF_CREATE_DATE     397526 non-null  object 
 4   STATUS             397526 non-null  int64  
 5   REQ_BRAND_CODE     397526 non-null  object 
 6   REQ_TOPMODEL_CODE  397237 non-null  float64
dtypes: float64(2), int64(3), object(2)
memory usage: 21.2+ MB
None
[1m[31m-------------------------------------------------------------------------------[0m
[1mNumber of Uniques:
[0mUnnamed: 0           397526
CUSTOMER_ID          183523
SALESFILE_ID 

In [6]:
# 4-Load|Read Data
dataset_path = "D_FINAL_SIFIR_ARAC_ALANLAR_DATATHON.csv"
df4 = pd.read_csv(dataset_path)
first_looking(df4)
duplicate_values(df4)
drop_columns(df4,['unnamed:_0'])
drop_null(df4, 90)

[1mShape:[0m(169525, 4)
There is 169525 observation and 4 columns in the dataset.
[1m[31m-------------------------------------------------------------------------------[0m[1m
Info:
[0m
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 169525 entries, 0 to 169524
Data columns (total 4 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   Unnamed: 0   169525 non-null  int64 
 1   VEHICLE_ID   169525 non-null  int64 
 2   CUSTOMER_ID  169525 non-null  int64 
 3   CREATE_DATE  169525 non-null  object
dtypes: int64(3), object(1)
memory usage: 5.2+ MB
None
[1m[31m-------------------------------------------------------------------------------[0m
[1mNumber of Uniques:
[0mUnnamed: 0     169525
VEHICLE_ID     169521
CUSTOMER_ID    138432
CREATE_DATE    151242
dtype: int64
[1m[31m-------------------------------------------------------------------------------[0m
[1mMissing Values:
[0mEmpty DataFrame
Columns: [Missing_Number, Missing_Per

In [7]:
df4.rename({"create_date":"sıfır_create_date"}, axis=1, inplace=True)

In [8]:
# 5-Load|Read Data
dataset_path = "D_FINAL_VEHICLE_TABLE_DATATHON.csv"
df5 = pd.read_csv(dataset_path)
first_looking(df5)
duplicate_values(df5)
drop_columns(df5,['unnamed:_0'])
drop_null(df5, 90)

[1mShape:[0m(345092, 8)
There is 345092 observation and 8 columns in the dataset.
[1m[31m-------------------------------------------------------------------------------[0m[1m
Info:
[0m
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 345092 entries, 0 to 345091
Data columns (total 8 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   Unnamed: 0      345092 non-null  int64 
 1   VEHICLE_ID      345092 non-null  int64 
 2   TRAFFIC_DATE    344226 non-null  object
 3   BRAND_CODE      345092 non-null  object
 4   BASEMODEL_CODE  345092 non-null  int64 
 5   TOPMODEL_CODE   345092 non-null  int64 
 6   MOTOR_GAS_TYPE  272646 non-null  object
 7   GEAR_BOX_TYPE   261077 non-null  object
dtypes: int64(4), object(4)
memory usage: 21.1+ MB
None
[1m[31m-------------------------------------------------------------------------------[0m
[1mNumber of Uniques:
[0mUnnamed: 0        345092
VEHICLE_ID        345092
TRAFFIC_DATE        74

In [9]:
# 6-Load|Read Data
dataset_path = "D_MASK_SERVIS_BAKIM_DATATHON_FINAL.csv"
df6 = pd.read_csv(dataset_path)
first_looking(df6)
duplicate_values(df6)
drop_columns(df6,['unnamed:_0'])
drop_null(df6, 90)

[1mShape:[0m(1041000, 5)
There is 1041000 observation and 5 columns in the dataset.
[1m[31m-------------------------------------------------------------------------------[0m[1m
Info:
[0m
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1041000 entries, 0 to 1040999
Data columns (total 5 columns):
 #   Column           Non-Null Count    Dtype  
---  ------           --------------    -----  
 0   Unnamed: 0       1041000 non-null  int64  
 1   CREATE_DATE      1041000 non-null  object 
 2   IS_MAINTENANCE   1041000 non-null  int64  
 3   VEHICLE_ID       1041000 non-null  int64  
 4   TOTAL_AMOUNT_TL  965596 non-null   float64
dtypes: float64(1), int64(3), object(1)
memory usage: 39.7+ MB
None
[1m[31m-------------------------------------------------------------------------------[0m
[1mNumber of Uniques:
[0mUnnamed: 0         1041000
CREATE_DATE        1033883
IS_MAINTENANCE           2
VEHICLE_ID          225447
TOTAL_AMOUNT_TL      16006
dtype: int64
[1m[31m-------------

In [10]:
df6.rename({"create_date":"servis_create_date"}, axis=1, inplace=True)

In [11]:
# 7-Load|Read Data
dataset_path = "D_sample_submission.csv"
df7 = pd.read_csv(dataset_path)
first_looking(df7)
duplicate_values(df7)
drop_columns(df7,[])
# drop_null(df7, 90)

[1mShape:[0m(166943, 2)
There is 166943 observation and 2 columns in the dataset.
[1m[31m-------------------------------------------------------------------------------[0m[1m
Info:
[0m
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 166943 entries, 0 to 166942
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   Id        166943 non-null  int64  
 1   Expected  0 non-null       float64
dtypes: float64(1), int64(1)
memory usage: 2.5 MB
None
[1m[31m-------------------------------------------------------------------------------[0m
[1mNumber of Uniques:
[0mId          166943
Expected         0
dtype: int64
[1m[31m-------------------------------------------------------------------------------[0m
[1mMissing Values:
[0m          Missing_Number  Missing_Percent
Expected          166943            1.000
[1m[31m-------------------------------------------------------------------------------[0m
[1mAll Columns:[

In [12]:
df7.columns

Index(['id', 'expected'], dtype='object')

In [13]:
df7.rename({"id" : "base_customer_id"}, axis=1, inplace=True)

In [14]:
df7.columns

Index(['base_customer_id', 'expected'], dtype='object')

In [15]:
df7.shape

(166943, 2)

In [16]:
df2.shape

(374879, 6)

In [17]:
df72 = df7.merge(df2, how = "left", on = ["base_customer_id"], )

In [18]:
df72.shape

(320181, 7)

In [19]:
df72.head()

Unnamed: 0,base_customer_id,expected,vehicle_id,start_date,end_date,fk_relation_status_id,fk_relation_status_explanation
0,168254,,5329232,2019-09-20,,1,Aktif Ruhsat Sahibi
1,168254,,5183621,2015-09-14,2019-07-05,2,Pasif Ruhsat Sahibi
2,168254,,5208488,2012-09-15,2014-02-19,2,Pasif Ruhsat Sahibi
3,168254,,5310923,2018-07-19,2019-08-16,2,Pasif Ruhsat Sahibi
4,168254,,5311914,2019-03-29,2021-02-10,2,Pasif Ruhsat Sahibi


In [20]:
pd.pivot(data=df72, index="base_customer_id", columns="fk_relation_status_explanation")

ValueError: Index contains duplicate entries, cannot reshape

In [15]:
df1.columns

Index(['base_customer_id', 'customer_id', 'gender', 'marital_status',
       'birth_date', 'fk_address_communication_city', 'occupation'],
      dtype='object')

In [16]:
df2.columns

Index(['base_customer_id', 'vehicle_id', 'start_date', 'end_date',
       'fk_relation_status_id', 'fk_relation_status_explanation'],
      dtype='object')

In [17]:
df3.columns

Index(['customer_id', 'salesfile_id', 'sf_create_date', 'status',
       'req_brand_code', 'req_topmodel_code'],
      dtype='object')

In [18]:
df4.columns

Index(['vehicle_id', 'customer_id', 'sıfır_create_date'], dtype='object')

In [19]:
df5.columns

Index(['vehicle_id', 'traffic_date', 'brand_code', 'basemodel_code',
       'topmodel_code', 'motor_gas_type', 'gear_box_type'],
      dtype='object')

In [20]:
df6.columns

Index(['servis_create_date', 'is_maintenance', 'vehicle_id',
       'total_amount_tl'],
      dtype='object')

In [21]:
df62 = df6.merge(df2, how = "outer", on = ["vehicle_id"], )

In [22]:
df625 = df62.merge(df5, how = "outer", on = ["vehicle_id"])

In [23]:
df625.head()

Unnamed: 0,servis_create_date,is_maintenance,vehicle_id,total_amount_tl,base_customer_id,start_date,end_date,fk_relation_status_id,fk_relation_status_explanation,traffic_date,brand_code,basemodel_code,topmodel_code,motor_gas_type,gear_box_type
0,2010-12-29 10:50:56,1.0,5000001,192.0,120581,2005-07-11,,1,Aktif Ruhsat Sahibi,1998-12-28,X,14911,9491,,
1,2013-08-21 10:34:21,1.0,5000001,267.0,120581,2005-07-11,,1,Aktif Ruhsat Sahibi,1998-12-28,X,14911,9491,,
2,2015-11-30 10:55:00,1.0,5000001,1747.0,120581,2005-07-11,,1,Aktif Ruhsat Sahibi,1998-12-28,X,14911,9491,,
3,2015-12-09 13:38:49,0.0,5000001,465.0,120581,2005-07-11,,1,Aktif Ruhsat Sahibi,1998-12-28,X,14911,9491,,
4,2017-11-23 09:53:22,1.0,5000001,549.0,120581,2005-07-11,,1,Aktif Ruhsat Sahibi,1998-12-28,X,14911,9491,,


In [24]:
df71 = df7.merge(df1, how = "left", on = ["base_customer_id"])

In [25]:
df713 = df71.merge(df3, how = "left", on = ["customer_id"])

In [26]:
df713.head()

Unnamed: 0,base_customer_id,expected,customer_id,gender,marital_status,birth_date,fk_address_communication_city,occupation,salesfile_id,sf_create_date,status,req_brand_code,req_topmodel_code
0,168254,,1507435,Erkek,Bekar,1988.0,Şanlıurfa,,,,,,
1,168254,,1663507,Erkek,Bekar,1988.0,Şanlıurfa,,,,,,
2,168254,,1507434,Erkek,Bekar,1988.0,Şanlıurfa,,8525045.0,2019-03-27 15:18:06,102.0,Y,9231.0
3,168254,,1507434,Erkek,Bekar,1988.0,Şanlıurfa,,343656.0,2010-11-23 16:15:05,102.0,Y,9201.0
4,168254,,1507434,Erkek,Bekar,1988.0,Şanlıurfa,,5908506.0,2016-09-30 12:00:53,105.0,Y,9231.0


In [27]:
df4.columns

Index(['vehicle_id', 'customer_id', 'sıfır_create_date'], dtype='object')

In [28]:
df713.columns

Index(['base_customer_id', 'expected', 'customer_id', 'gender',
       'marital_status', 'birth_date', 'fk_address_communication_city',
       'occupation', 'salesfile_id', 'sf_create_date', 'status',
       'req_brand_code', 'req_topmodel_code'],
      dtype='object')

In [29]:
df625.columns

Index(['servis_create_date', 'is_maintenance', 'vehicle_id', 'total_amount_tl',
       'base_customer_id', 'start_date', 'end_date', 'fk_relation_status_id',
       'fk_relation_status_explanation', 'traffic_date', 'brand_code',
       'basemodel_code', 'topmodel_code', 'motor_gas_type', 'gear_box_type'],
      dtype='object')

In [30]:
df_ = df713.merge(df625, how = "left", on = ["base_customer_id"])

In [31]:
df_.head(1)

Unnamed: 0,base_customer_id,expected,customer_id,gender,marital_status,birth_date,fk_address_communication_city,occupation,salesfile_id,sf_create_date,status,req_brand_code,req_topmodel_code,servis_create_date,is_maintenance,vehicle_id,total_amount_tl,start_date,end_date,fk_relation_status_id,fk_relation_status_explanation,traffic_date,brand_code,basemodel_code,topmodel_code,motor_gas_type,gear_box_type
0,168254,,1507435,Erkek,Bekar,1988.0,Şanlıurfa,,,,,,,2014-01-15 10:18:52,0.0,5183621,,2015-09-14,2019-07-05,2,Pasif Ruhsat Sahibi,2010-12-27,X,13042,9295,Dizel,Sürekli Değişken


In [None]:
df = df_.merge(df4, how = "left", on = ["customer_id"])

In [None]:
df.head(1)

In [None]:
df.info()

In [None]:
df.isnull().sum()

In [None]:
df.base_customer_id.nunique()

In [None]:
df[["base_customer_id", "customer_id", "traffic_date", "vehicle_id_x", "vehicle_id_y"]]