## Preparing the Dataset

In [None]:
import pandas as pd

In [None]:
# Census-income CSV; this frame supplies the training features and labels below.
# The path sits under /content/drive/MyDrive (presumably a mounted Drive in Colab).
df1 = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/KUxDEPA-Data-Science-and-Machine-Learning-Training-Course/dataset/01-census-income.csv',
)

In [None]:
# "Future census" CSV; this frame supplies the held-out test features and labels below.
df2 = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/KUxDEPA-Data-Science-and-Machine-Learning-Training-Course/dataset/02-future-census.csv',
)

In [None]:
# Show the column names of the training frame so model inputs can be picked from them.
df1.columns

Index(['age', 'workclass', 'weight', 'education', 'edu num', 'marital status',
       'occupation', 'relationship', 'race', 'sex', 'capital-gain',
       'capital-loss', 'hours-per-week', 'native country', 'label'],
      dtype='object')

In [None]:
# Columns used as model inputs (a subset of df1.columns).
cols = [
    'age',
    'edu num',
    'marital status',
    'sex',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
]

In [None]:
# Same feature columns from both frames: df1 for training, df2 for testing.
X_train, X_test = df1[cols], df2[cols]

In [None]:
# Target column ('label') from the training and test frames respectively.
y_train, y_test = df1['label'], df2['label']

## Transform Data

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer

In [None]:
# One-hot encode the two categorical columns; every other selected column is
# passed through unchanged.
# handle_unknown='ignore': the encoder learns its categories from the training
# frame only, so a category that appears only in the test frame would otherwise
# make transform() raise. With 'ignore' such a value is encoded as all zeros.
transformer = make_column_transformer(
    (OneHotEncoder(handle_unknown='ignore'), ['marital status', 'sex']),
    remainder='passthrough',
)

In [None]:
# Learn the encoding from the training features and encode them in one step.
X_train_transformed = transformer.fit_transform(X=X_train)

In [None]:
# Reuse the already-fitted transformer on the test features (no refitting).
X_test_transformed = transformer.transform(X=X_test)

## Decision Tree

### Fitting the Model

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Unconstrained decision tree with default hyperparameters.
# random_state is pinned because scikit-learn shuffles candidate features at
# every split; without a seed the fitted tree and its scores can differ
# between runs.
dtree = DecisionTreeClassifier(random_state=42)

In [None]:
# Train the default tree on the encoded training features.
dtree.fit(X=X_train_transformed, y=y_train)

DecisionTreeClassifier()

In [None]:
# Accuracy of the default tree on the data it was trained on.
dtree.score(X=X_train_transformed, y=y_train)

0.9190824821396941

In [None]:
# Accuracy of the default tree on the held-out test data.
dtree.score(X=X_test_transformed, y=y_test)

0.834971334971335

### Fitting the Model with Hyperparameters

In [None]:
# Regularised tree: limited depth and a minimum leaf size.
# Note: with min_samples_leaf=200 a node can only be split if it holds at least
# 400 samples, so min_samples_split=30 never becomes the binding constraint; it
# is kept here for illustration.
# random_state is pinned so the fitted tree is reproducible across runs.
dtree2 = DecisionTreeClassifier(
    max_depth=10,
    min_samples_split=30,
    min_samples_leaf=200,
    random_state=42,
)

In [None]:
# Train the regularised tree on the encoded training features.
dtree2.fit(X=X_train_transformed, y=y_train)

DecisionTreeClassifier(max_depth=10, min_samples_leaf=200, min_samples_split=30)

In [None]:
# Accuracy of the regularised tree on the training data.
dtree2.score(X=X_train_transformed, y=y_train)

0.8547853811182617

In [None]:
# Accuracy of the regularised tree on the held-out test data.
dtree2.score(X=X_test_transformed, y=y_test)

0.8530917280917281

### Feature Importance

In [None]:
# Names of the columns produced by the fitted transformer: the one-hot columns
# come first, followed by the passed-through remainder columns.
transformer.get_feature_names_out()

array(['onehotencoder__marital status_ Divorced',
       'onehotencoder__marital status_ Married-AF-spouse',
       'onehotencoder__marital status_ Married-civ-spouse',
       'onehotencoder__marital status_ Married-spouse-absent',
       'onehotencoder__marital status_ Never-married',
       'onehotencoder__marital status_ Separated',
       'onehotencoder__marital status_ Widowed',
       'onehotencoder__sex_ Female', 'onehotencoder__sex_ Male',
       'remainder__age', 'remainder__edu num', 'remainder__capital-gain',
       'remainder__capital-loss', 'remainder__hours-per-week'],
      dtype=object)

In [None]:
# Importance of each input column as learned by dtree2. Presumably in the same
# column order as transformer.get_feature_names_out() — verify before labelling.
dtree2.feature_importances_

array([1.29172835e-04, 0.00000000e+00, 4.24105507e-01, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 2.60874408e-03,
       6.17295144e-04, 5.19053202e-02, 2.34828667e-01, 2.00893017e-01,
       4.47150984e-02, 4.01971775e-02])

### Grid Search

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Candidate hyperparameter values; the grid search tries every combination
# (2 x 2 x 2 = 8 settings).
param_grid = dict(
    max_depth=[5, 10],
    min_samples_split=[30, 60],
    min_samples_leaf=[100, 200],
)
param_grid

{'max_depth': [5, 10],
 'min_samples_leaf': [100, 200],
 'min_samples_split': [30, 60]}

In [None]:
# Base estimator for the search. random_state is pinned so every candidate is
# fitted deterministically and the cross-validation scores are reproducible.
dtree_search = DecisionTreeClassifier(random_state=42)
# Exhaustive search over param_grid, scored with 5-fold cross-validation.
grid_search = GridSearchCV(dtree_search,
                           param_grid,
                           cv=5)

In [None]:
# Run the search on the encoded training data, then show the winning
# parameter combination together with its mean cross-validated score.
grid_search.fit(X=X_train_transformed, y=y_train)
(grid_search.best_params_, grid_search.best_score_)

({'max_depth': 10, 'min_samples_leaf': 200, 'min_samples_split': 30},
 0.8534865172566695)

## Decision Tree Regressor

### Preparing and Splitting the Dataset

In [None]:
# Cities CSV for the regression example; the file is decoded as TIS-620.
df3 = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/KUxDEPA-Data-Science-and-Machine-Learning-Training-Course/dataset/03-cities.csv',
    encoding='tis620',
)

In [None]:
# Predictors are the two coordinate columns. The target keeps its double
# brackets, so it remains a one-column DataFrame rather than a Series.
X, y = df3[['latitude', 'longitude']], df3[['temperature']]

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Random train/test split of the cities data (default test share).
# random_state is pinned so the partition, and therefore every score computed
# from it, is the same on each run.
# Note: this rebinds X_train/X_test/y_train/y_test, replacing the census splits
# defined earlier in the notebook.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

### Fitting the Model

In [None]:
from sklearn.tree import DecisionTreeRegressor

In [None]:
# Unconstrained regression tree on latitude/longitude -> temperature.
# random_state is pinned so tie-breaking between equally good splits is
# deterministic and the fitted tree is reproducible.
dt_reg = DecisionTreeRegressor(random_state=42)
dt_reg.fit(X_train, y_train)

DecisionTreeRegressor()

In [None]:
# R^2 of the regression tree on the held-out test split.
dt_reg.score(X=X_test, y=y_test)

0.5380030422785111

In [None]:
# R^2 of the regression tree on the data it was trained on.
dt_reg.score(X=X_train, y=y_train)

1.0