In [224]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
import re
import statsmodels.api as sm
from statsmodels.formula.api import ols

%matplotlib inline
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [225]:
def get_num_columns(database: pd.DataFrame) -> pd.DataFrame:
  _nums = database.select_dtypes(include = 'number').columns.to_list()

  return database[_nums].copy()

def get_obj_columns(database: pd.DataFrame) -> pd.DataFrame:
  _objs = database.select_dtypes(include = 'object').columns.to_list()

  return database[ _objs].copy()

def check_correlation_p_value(column1: pd.Series, column2: pd.Series,
                              coef_threshold = 0.5, p_threshold = 0.05,
                              include = None) -> list:


  includes = [None, 'all']
  assert include in includes, "include just be None or 'all'"

  _coef, _p_value = stats.pearsonr(column1, column2)

  if _p_value > p_threshold:
    return False
  if _coef >= coef_threshold or _coef <= -1.0*coef_threshold:
    if include == 'all':
      return [True, _coef, _p_value]
    return True

  return False

def check_affection(database: pd.DataFrame, target_column: str, coef_threshold = 0.1, p_threshold = 0.05, include = None) -> list:



  includes = [None, 'all']
  assert include in includes, "include just be None or 'all'"
  database = get_num_columns(database)

  _affection = []

  for i in database.columns:
    if i == target_column:
      continue
    _affect = check_correlation_p_value(database[i], database[target_column], coef_threshold = coef_threshold, p_threshold = p_threshold, include=include)
    if _affect and include == 'all':
      _affection.append([i, _affect[1], _affect[2]])
    elif _affect:
      _affection.append(i)


  return _affection

def get_strong_week_affection(database: pd.DataFrame, target_column: str, coef_threshold = [0.3, 0.5, 0.8], p_threshold = 0.05) -> dict:
  _strong_affection = []
  _week_affection = []
  _median_affection = []
  database = get_num_columns(database)

  _affection = check_affection(database= database, target_column= target_column, coef_threshold=coef_threshold[0], p_threshold=p_threshold, include = 'all')

  for column in _affection:
    X = {'name': column[0],
         'coef': column[1],
         'p_value': column[2]}
    if X['coef'] <= -1*coef_threshold[2] or  X['coef'] >= 1*coef_threshold[2]:
      _strong_affection.append(X)
    elif X['coef'] >= -1*coef_threshold[1] and X['coef'] <= coef_threshold[1]:
      _week_affection.append(X)
    else:
      _median_affection.append(X)



  return {'strong_affection': _strong_affection,
          'week_affection': _week_affection,
          'median_affection': _median_affection}


def get_category_influence(df:pd.DataFrame, target_column, get_top = None, plimit = 1e-04, ascending =True):
  _df = get_obj_columns(df)
  category = list(_df.columns)
  _category = [re.sub('\W', '', i) for i in category]
  _df.columns = _category
  _df[target_column] = df[target_column]


  model = ols(target_column + ' ~ ' + ' + '.join(_category), data = _df).fit()
  # thực hiện kiểm định ANOVA
  anova_table = sm.stats.anova_lm(model).reset_index()

  category.append('Residual')
  anova_table['index'] = category

  if ascending:
    anova_table = anova_table[anova_table['PR(>F)'] < plimit]
  else:
    anova_table = anova_table[anova_table['PR(>F)'] >= plimit]


  try:
    get_top = int(get_top)

    anova_table = anova_table.sort_values(by = 'PR(>F)', ascending= ascending)[:get_top]

    return anova_table['index'].to_list(), anova_table['PR(>F)'].to_list()
  except:
    anova_table = anova_table.sort_values(by = 'PR(>F)', ascending= ascending)
    return anova_table['index'].to_list(), anova_table['PR(>F)'].to_list()


In [226]:
## read data
df = pd.read_csv('Model-Evaluation-and-Refinement-R.csv')
df.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,symboling,normalized-losses,make,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,length,width,height,curb-weight,engine-type,num-of-cylinders,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price,city-L/100km,horsepower-binned,diesel,gas
0,0,0,3,122,alfa-romero,std,two,convertible,rwd,front,88.6,0.811148,0.890278,48.8,2548,dohc,four,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,13495.0,11.190476,Medium,0,1
1,1,1,3,122,alfa-romero,std,two,convertible,rwd,front,88.6,0.811148,0.890278,48.8,2548,dohc,four,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,16500.0,11.190476,Medium,0,1
2,2,2,1,122,alfa-romero,std,two,hatchback,rwd,front,94.5,0.822681,0.909722,52.4,2823,ohcv,six,152,mpfi,2.68,3.47,9.0,154.0,5000.0,19,26,16500.0,12.368421,Medium,0,1
3,3,3,2,164,audi,std,four,sedan,fwd,front,99.8,0.84863,0.919444,54.3,2337,ohc,four,109,mpfi,3.19,3.4,10.0,102.0,5500.0,24,30,13950.0,9.791667,Medium,0,1
4,4,4,2,164,audi,std,four,sedan,4wd,front,99.4,0.84863,0.922222,54.3,2824,ohc,five,136,mpfi,3.19,3.4,8.0,115.0,5500.0,18,22,17450.0,13.055556,Medium,0,1


In [227]:
df.drop(['Unnamed: 0.1', 'Unnamed: 0'], axis = 1, inplace = True)
df.head()

Unnamed: 0,symboling,normalized-losses,make,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,length,width,height,curb-weight,engine-type,num-of-cylinders,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price,city-L/100km,horsepower-binned,diesel,gas
0,3,122,alfa-romero,std,two,convertible,rwd,front,88.6,0.811148,0.890278,48.8,2548,dohc,four,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,13495.0,11.190476,Medium,0,1
1,3,122,alfa-romero,std,two,convertible,rwd,front,88.6,0.811148,0.890278,48.8,2548,dohc,four,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,16500.0,11.190476,Medium,0,1
2,1,122,alfa-romero,std,two,hatchback,rwd,front,94.5,0.822681,0.909722,52.4,2823,ohcv,six,152,mpfi,2.68,3.47,9.0,154.0,5000.0,19,26,16500.0,12.368421,Medium,0,1
3,2,164,audi,std,four,sedan,fwd,front,99.8,0.84863,0.919444,54.3,2337,ohc,four,109,mpfi,3.19,3.4,10.0,102.0,5500.0,24,30,13950.0,9.791667,Medium,0,1
4,2,164,audi,std,four,sedan,4wd,front,99.4,0.84863,0.922222,54.3,2824,ohc,five,136,mpfi,3.19,3.4,8.0,115.0,5500.0,18,22,17450.0,13.055556,Medium,0,1


In [228]:
df.isnull().sum()

symboling            0
normalized-losses    0
make                 0
aspiration           0
num-of-doors         0
body-style           0
drive-wheels         0
engine-location      0
wheel-base           0
length               0
width                0
height               0
curb-weight          0
engine-type          0
num-of-cylinders     0
engine-size          0
fuel-system          0
bore                 0
stroke               4
compression-ratio    0
horsepower           0
peak-rpm             0
city-mpg             0
highway-mpg          0
price                0
city-L/100km         0
horsepower-binned    1
diesel               0
gas                  0
dtype: int64

In [229]:
df['stroke'].fillna(df['stroke'].mean(), inplace = True)

In [230]:
df['stroke'].fillna(df['stroke'].mean(), inplace=True)
bin_min = df[df['horsepower-binned'] == 'Low']['horsepower'].max()
bin_max = df[df['horsepower-binned'] == 'High']['horsepower'].min()

def fill_bins(row, column, column_bin, bin_min, bin_max):
    if row[column] < bin_min:
        row[column_bin] = 'Low'
    elif row[column] > bin_max:
        row[column_bin] = 'High'
    else:
        row[column_bin] = 'Medium'

    return row

#df = df.apply(lambda row: fill_bins(row, 'horsepower', 'horsepower-binned', bin_min, bin_max), axis=1)

In [231]:
df = df.apply(lambda row: fill_bins(row, 'horsepower', 'horsepower-binned', bin_min, bin_max), axis = 1)
df.isnull().sum().sum()

0

In [232]:
num_affection = get_strong_week_affection(df, 'price')['strong_affection']
num_affection

[{'name': 'curb-weight',
  'coef': 0.8344145257702845,
  'p_value': 2.1895772388939654e-53},
 {'name': 'engine-size',
  'coef': 0.8723351674455188,
  'p_value': 9.26549162219582e-64},
 {'name': 'horsepower',
  'coef': 0.8095745670036559,
  'p_value': 6.36905742825956e-48}]

In [233]:
num_df = [i['name'] for i in num_affection]
num_df

['curb-weight', 'engine-size', 'horsepower']

In [234]:
obj_df = get_category_influence(df, target_column='price')[0]
obj_df

['make',
 'drive-wheels',
 'engine-type',
 'num-of-cylinders',
 'horsepower-binned',
 'aspiration',
 'engine-location']

In [235]:
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score

In [236]:
def monomial_metrics(_input: pd.Series, _output: pd.Series,
                   model = LinearRegression(), ) -> dict:
    if len(_input.shape) == 1:
        _name = _input.name
        _input = np.reshape(_input.values, (-1, 1))
    else: 
        _name = ', '.join(_input.columns)

    model.fit(_input, _output)
    _pred = model.predict(_input)

    mse = mean_squared_error(_output, _pred)
    r2 = r2_score(_output, _pred)
    
    return {'variable': _name,
            'MSE': mse,
            'R^2': r2}

In [237]:
monomial_metrics(df[num_df], df['price'])

{'variable': 'curb-weight, engine-size, horsepower',
 'MSE': 11994814.244502736,
 'R^2': 0.8091263794284135}

In [238]:
monomial_metrics(df['curb-weight'], df['price'])

{'variable': 'curb-weight',
 'MSE': 19088303.525747165,
 'R^2': 0.6962476008164488}

In [239]:
def polinomial_metrics(_input: pd.Series, _output: pd.Series,
                      degree = 1, model = None, csr_matrix = False) -> dict:

    if model == None:
        model = Pipeline([
            ('scale', StandardScaler()),
            ('polynomial', PolynomialFeatures(degree=degree)),
            ('model', LinearRegression())
        ])
        
    if csr_matrix == True:
        _name = 'all'
        
    elif len(_input.shape) == 1:
        _name = _input.name
        _input = np.reshape(_input.values, (-1, 1))
    else: 
        _name = ', '.join(_input.columns)
        
    

    model.fit(_input, _output)
    _pred = model.predict(_input)

    mse = mean_squared_error(_output, _pred)
    r2 = r2_score(_output, _pred)
    
    return {'variable': _name,
            'degree': degree,
            'MSE': mse,
            'R^2': r2}
        
    

In [240]:
polinomial_metrics(df[num_df[0]], df['price'], degree = 1)

{'variable': 'curb-weight',
 'degree': 1,
 'MSE': 19088303.525747165,
 'R^2': 0.6962476008164488}

In [241]:
total = {}
index = 0
for degree in range(1, 5):
    for i in num_df:
        total[index] = polinomial_metrics(df[i], df['price'], degree = degree)
        index += 1
        
    total[index] = polinomial_metrics(df[num_df], df['price'], degree = degree)
    index += 1
    
    

In [242]:
pd.DataFrame.from_dict(total).T\
.sort_values(['R^2'], ascending = False)

Unnamed: 0,variable,degree,MSE,R^2
11,"curb-weight, engine-size, horsepower",3,7233959.768192,0.884886
7,"curb-weight, engine-size, horsepower",2,10085920.965355,0.839503
3,"curb-weight, engine-size, horsepower",1,11994814.244503,0.809126
13,engine-size,4,13187054.245012,0.790154
9,engine-size,3,13187196.083556,0.790152
5,engine-size,2,15016546.995871,0.761042
1,engine-size,1,15021126.025174,0.760969
15,"curb-weight, engine-size, horsepower",4,15567841.191542,0.752269
12,curb-weight,4,17575054.567307,0.720328
8,curb-weight,3,17591002.681222,0.720074


In [243]:
obj_df

['make',
 'drive-wheels',
 'engine-type',
 'num-of-cylinders',
 'horsepower-binned',
 'aspiration',
 'engine-location']

In [244]:
def preprocesing_dataX(num_df: pd.Series, obj_df:pd.Series, model = None):
    
    X = pd.DataFrame()
    ## obj_df
    for i, obj in enumerate(obj_df.columns):
        label = LabelEncoder()
        encode = label.fit_transform(obj_df[obj])
        X[obj] = encode
        
    ## num_df
    scale = StandardScaler()
    X[num_df.columns] = scale.fit_transform(num_df)
    
    ## column transformer
    ct = ColumnTransformer([('town', OneHotEncoder(), [0, 1, 2])], remainder = 'passthrough')
    X = ct.fit_transform(X)
    
    return X
        

In [245]:
preprocesing_dataX(df[num_df], df[obj_df])

<201x38 sparse matrix of type '<class 'numpy.float64'>'
	with 1621 stored elements in Compressed Sparse Row format>

In [246]:
polinomial_metrics(X, y, csr_matrix = True, model=LinearRegression())

{'variable': 'all',
 'degree': 1,
 'MSE': 80949.65269160659,
 'R^2': 0.9987118472217825}