# Projekt modułów

In [1]:
import pandas as pd
import numpy as np 
from IPython.display import display

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_validate
from xgboost import XGBRegressor

import tensorflow as tf

from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import max_error, mean_absolute_error, mean_squared_error

In [5]:
def create_linear_regression(n_j=2):
    """Build an unfitted linear regression estimator.

    Args:
        n_j: Value forwarded as ``n_jobs`` to ``LinearRegression``.

    Returns:
        A new ``LinearRegression`` instance.
    """
    return LinearRegression(n_jobs=n_j)

def create_decision_tree_regressor(depth,min_split,spliter = 'best'):
    """Build an unfitted decision tree regressor with a fixed random seed.

    Args:
        depth: Forwarded as ``max_depth``.
        min_split: Forwarded as ``min_samples_split``.
        spliter: Forwarded as ``splitter``.

    Returns:
        A new ``DecisionTreeRegressor`` created with ``random_state=0``.
    """
    settings = {
        'max_depth': depth,
        'min_samples_split': min_split,
        'splitter': spliter,
        'random_state': 0,
    }
    return DecisionTreeRegressor(**settings)

def create_random_forest_regressor(depth, number_tree=100, criterion="squared_error", n_j=2):
    """Build an unfitted random forest regressor with a fixed random seed.

    Args:
        depth: Forwarded as ``max_depth``.
        number_tree: Forwarded as ``n_estimators``.
        criterion: Forwarded as ``criterion``.
        n_j: Forwarded as ``n_jobs``.

    Returns:
        A new ``RandomForestRegressor`` created with ``random_state=0``.
    """
    settings = {
        'n_estimators': number_tree,
        'max_depth': depth,
        'criterion': criterion,
        'n_jobs': n_j,
        'random_state': 0,
    }
    return RandomForestRegressor(**settings)

def create_XGBR(number_tree=250,learn_rate=0.01,tree_met="hist"):
    """Build an unfitted XGBoost regressor with a fixed random seed.

    Args:
        number_tree: Number of boosting rounds, forwarded as ``n_estimators``.
        learn_rate: Forwarded as ``learning_rate``.
        tree_met: Forwarded as ``tree_method``.

    Returns:
        A new ``XGBRegressor`` created with ``random_state=0``.
    """
    # The keyword is ``n_estimators`` (plural); the singular spelling used
    # previously meant ``number_tree`` never reached the tree-count setting.
    return XGBRegressor(
        n_estimators=number_tree,
        learning_rate=learn_rate,
        tree_method=tree_met,
        random_state=0,
    )

def create_KNNR(nn=3,weight='uniform',n_j=4):
    """Build an unfitted k-nearest-neighbours regressor.

    Args:
        nn: Forwarded as ``n_neighbors``.
        weight: Forwarded as ``weights``.
        n_j: Forwarded as ``n_jobs``.

    Returns:
        A new ``KNeighborsRegressor`` instance.
    """
    return KNeighborsRegressor(
        n_neighbors=nn,
        weights=weight,
        n_jobs=n_j,
    )

def create_nn(number_hiden_layers=2,dense=None,loss='mean_absolute_error',
              learning_rate=0.001,
              metrics=None,
              input_shape=(16,)):
    """Build and compile a fully connected Keras regression network.

    The network is an input layer, ``number_hiden_layers`` ReLU ``Dense``
    layers sized from ``dense``, and a single-unit output layer. It is
    compiled with the Adam optimizer.

    Args:
        number_hiden_layers: How many hidden layers to add.
        dense: Units per hidden layer; needs at least ``number_hiden_layers``
            entries. ``None`` means ``[128, 64]``.
        loss: Loss passed to ``model.compile``.
        learning_rate: Learning rate passed to ``Adam``.
        metrics: Metrics passed to ``model.compile``. ``None`` means a fresh
            ``MeanSquaredError`` and ``RootMeanSquaredError`` pair.
        input_shape: Shape passed to the input layer. A bare int ``n`` is
            accepted and treated as ``(n,)``.

    Returns:
        The compiled ``tf.keras.Sequential`` model.

    Raises:
        ValueError: If ``dense`` has fewer entries than ``number_hiden_layers``.
    """
    # Defaults are created per call: metric objects built in the signature
    # would be created once and shared by every model using the defaults.
    if dense is None:
        dense = [128, 64]
    if metrics is None:
        metrics = [
            tf.keras.metrics.MeanSquaredError(name='mean_squared_error'),
            tf.keras.metrics.RootMeanSquaredError(name='root_mean_squared_error'),
        ]
    # Callers in this notebook pass X.shape[1], i.e. a plain int.
    if isinstance(input_shape, int):
        input_shape = (input_shape,)
    if number_hiden_layers > len(dense):
        raise ValueError(
            f'dense has {len(dense)} entries but {number_hiden_layers} hidden layers were requested'
        )

    model = tf.keras.Sequential()
    model.add(tf.keras.layers.Input(shape=input_shape))

    for i in range(number_hiden_layers):
        model.add(tf.keras.layers.Dense(dense[i], activation='relu'))

    model.add(tf.keras.layers.Dense(1))
    model.compile(
        loss=loss,
        optimizer=tf.keras.optimizers.Adam(learning_rate),
        metrics=metrics
    )
    return model

def read_dataset(number: int):
    """Load one of the numbered CSV data sets.

    Args:
        number: Data set number; it is inserted into the file name
            ``./Data/DataSets/DataSet<number>.csv``.

    Returns:
        The ``DataFrame`` produced by ``pd.read_csv`` for that file.
    """
    csv_path = f'./Data/DataSets/DataSet{number!s}.csv'
    return pd.read_csv(csv_path)

def is_null(data):
    """Print, per column, how many values in ``data`` are null."""
    print(data.isnull().sum())

def heatmap(data):
    """Plot a correlation heat map of ``data`` and show it.

    The upper triangle (including the diagonal) is masked, so each pair of
    columns is drawn once.

    Args:
        data: Object whose ``corr()`` result is plotted.
    """
    correlations = data.corr()
    # True marks the cells to hide: everything on or above the diagonal.
    hidden_cells = np.triu(np.ones_like(correlations, dtype=bool))
    plt.subplots(figsize=(11, 9))
    palette = sns.diverging_palette(230, 20, as_cmap=True)
    sns.heatmap(
        correlations,
        mask=hidden_cells,
        cmap=palette,
        center=0,
        vmax=0.8,
        square=True,
        linewidths=.5,
        cbar_kws={"shrink": .5},
    )
    plt.show()

def describe_data(data):
    """Render the ``describe()`` summary of ``data`` via ``display``."""
    display(data.describe())

def prepare_data_to_train(data):
    """Split a data set into features and target.

    Args:
        data: Frame that has a ``'Population'`` column.

    Returns:
        tuple: ``(X, y)`` where ``X`` is every column except the last one and
        ``y`` is the ``'Population'`` column.
    """
    # NOTE(review): presumably 'Population' is the last column, otherwise the
    # target also stays inside the features - confirm against the data sets.
    target = data['Population']
    features = data.iloc[:, :-1]
    return features, target

def split_to_train_test(X, y, test_size=0.2):
    """Split features and target into train and test parts with a fixed seed.

    Args:
        X: Features passed to ``train_test_split``.
        y: Target passed to ``train_test_split``.
        test_size: Forwarded as ``test_size``.

    Returns:
        tuple: ``(X_train, X_test, y_train, y_test)``.
    """
    train_features, test_features, train_target, test_target = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=0,
    )
    return train_features, test_features, train_target, test_target

def cross_val_fit(model, X, y,metrics ,cv=5):
    """Cross-validate ``model`` and print every test metric fold by fold.

    Args:
        model: Estimator passed to ``cross_validate``.
        X: Features.
        y: Target.
        metrics: Forwarded as ``scoring``.
        cv: Number of folds; also used as the number of result rows printed.
    """
    scores = cross_validate(model, X, y, cv=cv, scoring=metrics)
    # Only the 'test...' entries are metrics; the rest of the dict is skipped.
    test_keys = [key for key in scores if key.startswith('test')]
    for fold in range(cv):
        print(f'Metrics in {fold+1}-fold:')
        for key in test_keys:
            label = key.split("test_")[1]
            value = scores[key][fold]
            print(f'{label}: {value:.2f}')
 
def nn_fit(model, X_train, X_test, y_train, y_test, n_epoch):
    """Train ``model`` and print its evaluation on the test split.

    Args:
        model: Object exposing ``fit`` and ``evaluate``.
        X_train: Training features.
        X_test: Test features, also used as validation data during training.
        y_train: Training target.
        y_test: Test target, also used as validation data during training.
        n_epoch: Forwarded as ``epochs``.
    """
    holdout = (X_test, y_test)
    model.fit(X_train, y_train, epochs=n_epoch, validation_data=holdout)
    print(model.evaluate(X_test, y_test, return_dict=True))

def select_problem():
    """Ask which problem type to solve until a supported one is chosen.

    Only option 1 (regression) is accepted. Options 2 and 3 are announced as
    future work, and any other answer is rejected. The prompt repeats until
    option 1 is chosen.

    Returns:
        int: Always ``1``, the only supported problem type.
    """
    prompt = '''
        Select which type of problem from 1 to 3:
        1. Regression
        2. Classification
        3. Clustering
        '''
    # A loop replaces the earlier recursive retries, whose results were never
    # returned, so any invalid first answer made the function return None.
    while True:
        try:
            problem = int(input(prompt))
        except ValueError:
            print('Wrong type. Please select number from 1 to 3')
            continue
        if problem == 1:
            return problem
        if 1 < problem <= 3:
            print('To be added in the future')
        else:
            print('Please select number from 1 to 3')

def select_data():
    """Ask for a data set number (1-14) until it is valid, then load it.

    Returns:
        The value returned by ``read_dataset`` for the chosen number.
    """
    # Only the int() conversion is guarded, so errors raised while reading
    # the file are not mistaken for a mistyped number.
    while True:
        try:
            n_dataset = int(input('Select dataset 1 to 14 (description of data sets in the documentation): '))
        except ValueError:
            print('Wrong type. Please select number from 1 to 14')
            continue
        if 0 < n_dataset <= 14:
            return read_dataset(n_dataset)
        print('Please select number from 1 to 14')

def visualiztaion_data_module(data):
    """Run the interactive data inspection menu until the user types "stop".

    Each answer is stripped and lower-cased, then matched against the option
    number or name: heat map, null counts, or descriptive statistics.

    Args:
        data: Data set handed to the selected inspection function.
    """
    prompt = '''
        Select one option from below or write "stop" to end visualization module:
        1.Heatmap -> plot heat map
        2.Isnull -> is NaN value
        3.Describe data set 
        '''
    # 'isnull' matches the menu label; the old misspelt keyword 'isnul' is
    # kept so existing habits keep working.
    actions = {
        'heatmap': heatmap, '1': heatmap,
        'isnull': is_null, 'isnul': is_null, '2': is_null,
        'describe': describe_data, '3': describe_data,
    }
    while True:
        visual_input = input(prompt).strip().lower()
        if visual_input == 'stop':
            print('Ending visualiztion module')
            return
        action = actions.get(visual_input)
        if action is None:
            print('Wrong choice')
        else:
            action(data)

def check_param_num_float(min_val,max_val):
    """Prompt until the user enters a float within ``[min_val, max_val]``.

    Args:
        min_val: Smallest accepted value (inclusive).
        max_val: Largest accepted value (inclusive).

    Returns:
        float: The first entered value that parses and lies within the range.
    """
    # A loop replaces the earlier retry, which called an undefined name and
    # did not return the retried value.
    while True:
        try:
            x = float(input(f'Write float value between {min_val} and {max_val} (value with "." instead ",")'))
        except ValueError:
            print('Wrong type')
            continue
        if min_val <= x <= max_val:
            return x
        print('Wrong range')


def check_param_num_int(min_val,max_val):
    """Prompt until the user enters an integer within ``[min_val, max_val]``.

    Args:
        min_val: Smallest accepted value (inclusive).
        max_val: Largest accepted value (inclusive).

    Returns:
        int: The first entered value that parses and lies within the range.
    """
    # A loop replaces the earlier retry, which called an undefined name and
    # did not return the retried value.
    while True:
        try:
            x = int(input(f'Write integer value between {min_val} and {max_val}'))
        except ValueError:
            print('Wrong type')
            continue
        if min_val <= x <= max_val:
            return x
        print('Wrong range')

def check_param_str(params):
    """Prompt until the user types one of the allowed options.

    The options are listed before every attempt and the answer is lower-cased
    before it is compared.

    Args:
        params: Collection of accepted (lower-case) answers.

    Returns:
        str: The accepted answer.
    """
    # A loop replaces the earlier recursive retry, whose result was dropped,
    # so one wrong answer made the function return None.
    while True:
        print('Select one from below:')
        for param in params:
            print(param)
        x = input('Write: ').lower()
        if x in params:
            return x
        print('Wrong choice')

def model_selection(X):
    """Interactively choose a regression model and build it.

    The user picks a model by number or name, then chooses between the
    predefined configuration and entering the hyper-parameters by hand.
    An unrecognised answer at either step restarts the selection.

    Args:
        X: Feature table; ``X.shape`` is used to bound tree depth and to size
            the neural network input.

    Returns:
        tuple: ``(model, is_nn)`` where ``is_nn`` is ``1`` for the neural
        network and ``0`` for every other model.
    """
    menu = '''
    Choose from the following models:
    1. Decision tree
    2. Random forest
    3. kNN
    4. Linear regression
    5. XGBR
    6. Neural network 
    '''
    # Answers are lower-cased before lookup, so every key is lower-case
    # (the old 'XGBR' pattern could never match).
    options = {
        'decision tree': (_build_decision_tree, 0), '1': (_build_decision_tree, 0),
        'random forest': (_build_random_forest, 0), '2': (_build_random_forest, 0),
        'knn': (_build_knn, 0), '3': (_build_knn, 0),
        'linear regression': (_build_linear_regression, 0), '4': (_build_linear_regression, 0),
        'xgbr': (_build_xgbr, 0), '5': (_build_xgbr, 0),
        'neural network': (_build_neural_network, 1), '6': (_build_neural_network, 1),
    }
    # A loop replaces the earlier recursive retries, which passed an undefined
    # ``y`` and never returned the retried result.
    while True:
        choose_model = input(menu).strip().lower()
        entry = options.get(choose_model)
        if entry is None:
            print('Wrong choice')
            continue
        builder, is_nn = entry

        decision = input('Do you want to use a predefined model: Yes or No ').strip().lower()
        if decision in ('yes', 'y'):
            return builder(X, True), is_nn
        if decision in ('no', 'n'):
            return builder(X, False), is_nn
        print('Wrong answer')


def _build_decision_tree(X, predefined):
    """Create a decision tree regressor, predefined or from prompted settings."""
    if predefined:
        return create_decision_tree_regressor(int(X.shape[0]/2), 2, 'best')
    print('Select max depth of tree')
    depth = check_param_num_int(2, X.shape[0])
    print('Select minimum number of samples')
    min_split = check_param_num_int(2, int(X.shape[0]))
    print('Choose the split at each node')
    spliter = check_param_str(['best', 'random'])
    return create_decision_tree_regressor(depth, min_split, spliter)


def _build_random_forest(X, predefined):
    """Create a random forest regressor, predefined or from prompted settings."""
    if predefined:
        return create_random_forest_regressor(int(X.shape[0]/2))
    print('Select max depth of trees')
    depth = check_param_num_int(2, X.shape[0])
    print('Select number of trees in the forest')
    number_tree = check_param_num_int(50, 500)
    print('Select criterion')
    criterion = check_param_str(['squared_error', 'absolute_error', 'friedman_mse', 'poisson'])
    print('Select number of jobs to run in parallel')
    n_j = check_param_num_int(1, 8)
    return create_random_forest_regressor(depth, number_tree, criterion, n_j)


def _build_knn(X, predefined):
    """Create a kNN regressor, predefined or from prompted settings."""
    if predefined:
        return create_KNNR()
    print('Select number of neighbors')
    nn = check_param_num_int(3, 50)
    print('Select weight function used in prediction')
    weight = check_param_str(['uniform', 'distance'])
    print('Select number of jobs to run in parallel')
    n_j = check_param_num_int(1, 8)
    return create_KNNR(nn, weight, n_j)


def _build_linear_regression(X, predefined):
    """Create a linear regression model, predefined or from prompted settings."""
    if predefined:
        return create_linear_regression()
    print('Select number of jobs to run in parallel')
    n_j = check_param_num_int(1, 8)
    return create_linear_regression(n_j)


def _build_xgbr(X, predefined):
    """Create an XGBoost regressor, predefined or from prompted settings."""
    if predefined:
        return create_XGBR()
    # The value feeds ``number_tree``; the old prompt asked for a max depth.
    print('Select number of trees')
    number_tree = check_param_num_int(50, 500)
    print('Select learning rate')
    learn_rate = check_param_num_float(0, 1)
    print('Select tree method')
    tree_met = check_param_str(['exact', 'approx', 'hist'])
    return create_XGBR(number_tree, learn_rate, tree_met)


def _build_neural_network(X, predefined):
    """Create a neural network, predefined or from prompted settings."""
    # One input per feature column, passed as a shape tuple rather than an int.
    input_shape = (X.shape[1],)
    if predefined:
        return create_nn(
            2, [128, 64], 'mean_absolute_error', 0.001,
            [tf.keras.metrics.MeanSquaredError(name='mean_squared_error'),
             tf.keras.metrics.RootMeanSquaredError(name='root_mean_squared_error')],
            input_shape,
        )
    print('Select the number of hidden layers')
    hiden = check_param_num_int(1, 10)
    dense = []
    for i in range(hiden):
        print(f'Select number of neurons in {i+1} hiden layer')
        dense.append(check_param_num_int(2, 1024))

    # NOTE(review): 'root_mean_squared_error' may not be a loss name Keras
    # accepts as a string - confirm before relying on this option.
    loss_func = check_param_str(['mean_absolute_error', 'mean_squared_error', 'root_mean_squared_error'])

    print('Select learning rate')
    learn_rate = check_param_num_float(0, 1)

    metrics = metrics_for_train(['mean_absolute_error', 'mean_squared_error', 'root_mean_squared_error'])
    return create_nn(hiden, dense, loss_func, learn_rate, metrics, input_shape)

def metrics_for_train(metric_acc):
    """Let the user pick one or more metrics from ``metric_acc``.

    Selection ends when the user types ``stop`` after choosing at least one
    metric, or as soon as every available metric has been chosen.

    Args:
        metric_acc: Names of the metrics the user may choose from.

    Returns:
        list: Chosen metrics in selection order. Each entry is the metric
        name, except ``'root_mean_squared_error'``, which is returned as a
        ``tf.keras.metrics.RootMeanSquaredError`` object.
    """
    # Names are tracked separately from the returned list: the RMSE entry is
    # an object, so a name lookup in ``metrics`` would miss it and allow
    # duplicates.
    chosen_names = []
    metrics = []
    while len(chosen_names) < len(metric_acc):
        print('''
        Select metrics from below or stop 
        ''')
        for m in metric_acc:
            print(m)

        metric = input().strip().lower()
        if metric in chosen_names:
            print('Please choose a different metric')
        elif metric in metric_acc:
            chosen_names.append(metric)
            if metric == 'root_mean_squared_error':
                metrics.append(tf.keras.metrics.RootMeanSquaredError(name='root_mean_squared_error'))
            else:
                metrics.append(metric)
        elif metric == 'stop':
            if metrics:
                print('End of choice of metrics')
                break
            # Keep asking: callers index into the result, so an empty list
            # must never be returned for a non-empty ``metric_acc``.
            print('You must choose one metric')
        else:
            print('Wrong choice')

    return metrics

def trening_models(model,X,y):
    """Ask for cross-validation settings and evaluate ``model``.

    Prompts for the number of folds and the scoring metrics, then hands
    everything to ``cross_val_fit``.

    Args:
        model: Estimator to evaluate.
        X: Features.
        y: Target.
    """
    print('Specify the number of k-folds in cross-validation')
    # K-fold cross-validation needs at least two folds, so 1 is not offered.
    cv = check_param_num_int(2,20)
    metrics = metrics_for_train(['max_error','neg_mean_absolute_error','neg_mean_squared_error','neg_root_mean_squared_error'])
    if not metrics:
        print('You must choose one metric')
        return
    # A single metric is passed as a plain string, several as a list.
    scoring = metrics if len(metrics) > 1 else metrics[0]
    cross_val_fit(model, X, y, metrics=scoring, cv=cv)

def trening_nn(model,X,y):
    """Ask for the split and epoch settings, then train the neural network.

    Args:
        model: Network handed to ``nn_fit``.
        X: Features.
        y: Target.
    """
    print('Define what percentage will be test data, where 0.2 equals 20%: ')
    test_size = check_param_num_float(0,1)
    # The prompt accepts the endpoints, but a test fraction of exactly 0 or 1
    # leaves one side of the split empty, so ask again in that case.
    while test_size is None or not 0 < test_size < 1:
        print('Test size must be greater than 0 and less than 1')
        test_size = check_param_num_float(0,1)
    X_train, X_test, y_train, y_test = split_to_train_test(X,y,test_size)
    print('How many epochs the neural network has to learn')
    n_epochs = check_param_num_int(1,1000)
    nn_fit(model, X_train, X_test, y_train, y_test, n_epochs)

In [3]:
#Start function
def start_modules():
    """Run the whole interactive workflow from problem choice to training.

    The steps are: choose the problem type, load a data set, inspect it,
    split it into features and target, then repeatedly choose and train a
    model until the user answers ``no`` to trying another one.
    """
    print('Start AI project')
    select_problem()
    dataset = select_data()
    visualiztaion_data_module(dataset)
    X, y = prepare_data_to_train(dataset)

    # is_nn flag -> training routine for that kind of model.
    trainers = {0: trening_models, 1: trening_nn}
    answer = ''
    while answer != 'no':
        model, is_nn = model_selection(X)
        trainer = trainers.get(is_nn)
        if trainer is None:
            continue
        trainer(model, X, y)
        print('Do you want to try a different model')
        answer = check_param_str(['yes','no'])

In [6]:
# Launch the interactive workflow; it prompts for every choice on stdin.
start_modules()

Start AI project



        Select which type of problem from 1 to 3:
        1. Regression
        2. Classification
        3. Clustering
         1
Select dataset 1 to 14 (description of data sets in the documentation):  5

        Select one option from below or write "stop" to end visualization module:
        1.Heatmap -> plot heat map
        2.Isnull -> is NaN value
        3.Describe data set 
         stop


Ending visualiztion module



    Choose from the following models:
    1. Decision tree
    2. Random forest
    3. kNN
    4. Linear regression
    5. XGBR
    6. Neural network 
     a


Wrong choice



    Choose from the following models:
    1. Decision tree
    2. Random forest
    3. kNN
    4. Linear regression
    5. XGBR
    6. Neural network 
     1


KeyboardInterrupt: Interrupted by user