In [1]:
import pandas as pd
import numpy as np

from matplotlib import pyplot as plt 
import seaborn as sns


df = pd.read_csv("data.csv")

In [2]:
#Preprocessing

df.columns = df.columns.str.lower().str.replace(' ','_')
#convertir a lista solo las columnas que tengan dtype == object
string_columns = list(df.dtypes[df.dtypes =='object'].index)
for col in string_columns:
    #recorrer la lista creada y usar cada elemento para reemplazar su contenido por minusculas y los espacios por _
    df[col] = df[col].str.lower().str.replace(' ', '_')

In [3]:
base = ['engine_hp','engine_cylinders','highway_mpg', 'city_mpg', 'popularity']
n = len(df)

n_valid = int( 0.2 * n)
n_test = int( 0.2 * n)

n_train = n - (n_valid + n_test) 

np.random.seed(2)
idx = np.arange(n)
np.random.shuffle(idx)

df_shuffled = df.iloc[idx]

"""
        definiendo los dataframes
"""
df_train = df_shuffled.iloc[:n_train].copy()
df_valid = df_shuffled.iloc[n_train:n_train+n_valid].copy()
df_test = df_shuffled.iloc[n_train+n_valid:].copy()

"""
        sacando target de cada uno
"""
y_train = np.log1p(df_train.msrp.values)
y_valid = np.log1p(df_valid.msrp.values)
y_test = np.log1p(df_test.msrp.values)
del df_train['msrp']
del df_valid['msrp']
del df_test['msrp']






In [4]:

"""
Funcion para calcular rmse
"""

def rmse(y, y_pred):
    error = y_pred - y
    mse = (error ** 2).mean()
    return np.sqrt(mse)

"""
funcion para linear regresion no regularizada
"""
def train_linear_regression(X,y):
    #adding the dummy column
    ones= np.ones(X.shape[0])
    X = np.column_stack([ones,X])

    #normal equation formula

    XTX = X.T.dot(X)
    XTX_inv = np.linalg.inv(XTX)
    w = XTX_inv.dot(X.T).dot(y)

    return w[0], w[1:]

"""
funcion para linear regression regularizada
"""
def train_linear_regression_reg(X, y, r=0.0):
    ones = np.ones(X.shape[0])          
    X = np.column_stack([ones, X])      

    XTX = X.T.dot(X)                    
    reg = r * np.eye(XTX.shape[0])      
    XTX = XTX + reg                     

    XTX_inv = np.linalg.inv(XTX)        
    w = XTX_inv.dot(X.T).dot(y)         

    return w[0], w[1:]

"""
Funcion para crear columnas categoricas en el DF
"""

def binary(df, column, n_values, features):
    df_copy = df.copy()
    for elem in df_copy[column].value_counts().head(n_values).index.tolist():
        new_column_name = f'{column}_{elem}'
        df[new_column_name] = (df_copy[column] == elem).astype(int)
        features.append(new_column_name)
    return df


"""
Funcion para preparar array X
"""


def prepare_X(df):   
    df = df.copy()
    features = base.copy()
    
    df['age'] = 2017 - df.year
    features.append('age')          
    
    categorical_columns = ['number_of_doors','make', 'engine_fuel_type', 'transmission_type', 'driven_wheels',
                            'market_category', 'vehicle_size', 'vehicle_style']
    
    for column in categorical_columns:
        df = binary(df, column, 9, features)                                   

    df_num = df[features]
    df_num = df_num.fillna(0)
    X = df_num.values
    return X

<h3>Testing the model

In [49]:
X_train = prepare_X(df_train)

In [50]:
w_0, w = train_linear_regression_reg(X_train, y_train, r= 0.01)

In [46]:
X_valid = prepare_X(df_valid)

In [47]:
y_pred = w_0 + X_valid.dot(w)

In [48]:
print('prediction: ', rmse(y_valid, y_pred))

prediction:  0.4834399565307656
