In [None]:
import pandas as pd
from DataCleaning import DataCleaning
from EDA import EDA
from KFoldCrossValidation import KFoldCrossValidation
from LinearRegression import LinearRegression

### 1. First, the dataset needs to be loaded. For the sake of simplicity, we will use a single object for loading and cleaning the dataset.

In [None]:
# Build the cleaning helper for the training CSV and take the DataFrame it exposes.
data_cleaner = DataCleaning('assets/auto_train.csv')
df = data_cleaner.df
# Preview the frame (presumably the first rows; see DataCleaning.show_head).
data_cleaner.show_head(df)


 ##### 1. a. Checking the missing values
To understand the dataset, we need to see which columns contain missing values.

In [None]:
# Report the missing values of the training frame (output format is defined in DataCleaning).
data_cleaner.get_nan_values(df)

##### 1. b. Cleaning the dataset of NaN columns
We will drop the columns where at least 2/3 of the data is missing.

In [None]:
# Replace `df` with the frame returned by the cleaner after removing mostly-empty columns.
# NOTE(review): the drop threshold lives inside DataCleaning — confirm it matches the text above.
df = data_cleaner.delete_columns_with_nans(df)

##### 1. c. Check the rest of the missing values in the columns

To see what has to be done next in the cleaning phase, we need to see which columns still have missing values. We will do that for both numerical and object columns.

##### Checking the numerical data

In [None]:
# Inspect the numeric columns of the *current* frame that still contain NaNs.
# The column list is derived from `df` itself (not from `data_cleaner.df`) so it
# cannot reference columns that were removed from `df` in the previous step.
numeric_columns = df.select_dtypes(include=['number'])
numeric_columns_with_nan = numeric_columns.columns[numeric_columns.isna().any()].tolist()
numeric_df_with_nan = df[numeric_columns_with_nan]
numeric_df_with_nan.info()
numeric_dataframe = pd.DataFrame(numeric_df_with_nan)


##### Checking the object data

In [None]:
# Inspect the object (string-like) columns of the *current* frame that still contain NaNs.
# The column list is derived from `df` itself (not from `data_cleaner.df`) so it
# cannot reference columns that were removed from `df` in an earlier step.
object_columns = df.select_dtypes(include=['object'])
object_columns_with_nan = object_columns.columns[object_columns.isna().any()].tolist()
object_df_with_nan = df[object_columns_with_nan]
object_df_with_nan.info()
object_dataframe = pd.DataFrame(object_df_with_nan)


##### 1. d. Filling the missing numeric data

The numerical data will be filled with the median of the columns

In [None]:
# Replace `df` with the frame returned by the cleaner's median fill.
df = data_cleaner.fill_nan_with_median(df)
# NOTE(review): `object_df_with_nan` was sliced from `df` before this fill, so this
# shows the object columns as they were earlier, not the effect of the median fill —
# confirm this is the intended check.
object_df_with_nan.info()


##### 1. e. Dealing with categorical data

First, we need to see which columns contain categorical data.

In [None]:
# Ask the cleaner to report the categorical columns of the current frame.
data_cleaner.detect_categorical_data(df)

We will now fill the categorical data with the most frequent values in those columns

In [None]:
# Replace `df` with the frame returned by the cleaner's most-frequent-value fill.
df = data_cleaner.fill_nan_with_frequent(df)

### 2. EDA

##### 2. a. Initializing EDA

In this part, we will take care of the EDA. Because some of the columns are not relevant to the analysis, they need to be dropped. Then we plot the distribution based on the price column.

In [None]:
# Keep the car ids aside before the column is removed from the working frame.
car_ids = df["id"]
print(car_ids)

# Columns that are not used in the analysis.
irrelevant_columns = ['id', 'data', 'url']
df = df.drop(columns=irrelevant_columns)

# Create the EDA helper on the reduced frame and show its distribution plot.
EDA_analyzer = EDA(df)
EDA_analyzer.show_distribution(df)


#### 2. b. Detecting outliers in columns

We can see that the distribution does not resemble a classic distribution, so transformations are required. To choose them, we first need to determine the outliers.

In [None]:
# Target column, kept under its own name.
price_column = df['pret']

# Mapping of column name -> outliers, as returned by the EDA helper.
numeric_outliers = EDA_analyzer.detect_outliers(df)
for col, outliers in numeric_outliers.items():
    # Header, the outliers themselves, then a blank separator line.
    print(f"Outliers in column '{col}':", outliers, "", sep="\n")

# Names of the columns for which outliers were computed.
column_list = list(numeric_outliers)
print(column_list)

#### 2. c. Plotting the outliers

In [None]:
# Plot the outliers detected above for the untransformed frame.
EDA_analyzer.plot_outliers(df, numeric_outliers)

#### 2. d. Applying the logarithmic transformations

To get a better distribution, we need to apply a logarithmic transformation.

In [None]:
# Apply the EDA helper's logarithmic transformation to the columns listed in `column_list`.
# NOTE(review): whether `apply_logarithmic` also modifies `df` in place is not visible here — confirm.
transformed_df = EDA_analyzer.apply_logarithmic(df, column_list)
# `column_list` is overwritten with the column names reported for the transformed frame.
column_list = EDA_analyzer.get_columns_names(transformed_df)
# Re-run outlier detection on the transformed frame and plot the result.
outliers_after = EDA_analyzer.detect_outliers(transformed_df)
EDA_analyzer.plot_outliers(transformed_df, outliers_after)

We can see here that the distribution now resembles a classic distribution.

In [None]:
# Show the distribution plot again, this time for the transformed frame.
EDA_analyzer.show_distribution(transformed_df)

#### 2. e. Plotting the data

In order to better understand the data, we need to plot it with the target variable, which in this case is the price.

In [None]:
# Plot the 'Km' column against the target 'pret' (plot type is defined in EDA.plot_graph).
EDA_analyzer.plot_graph(df, 'Km', 'pret')

In [None]:
# Plot the 'Putere' column against the target 'pret' (plot type is defined in EDA.plot_graph).
EDA_analyzer.plot_graph(df, 'Putere', 'pret')

In [None]:
# Plot the 'Capacitate cilindrica' column against the target 'pret' (plot type is defined in EDA.plot_graph).
EDA_analyzer.plot_graph(df, 'Capacitate cilindrica', 'pret')

In [None]:
# Plot the 'Consum Urban' column against the target 'pret' (plot type is defined in EDA.plot_graph).
EDA_analyzer.plot_graph(df, 'Consum Urban', 'pret')

#### 3. K-Folds Cross Validation

In order to predict how well our model will perform, we need to apply cross-validation on the dataset.

In [None]:
# Wrap the frame in the cross-validation helper and continue with the frame it exposes.
k_fold_cross_validation = KFoldCrossValidation(df)
print(df.columns)
df = k_fold_cross_validation.df

# Split the samples into buckets on 'pret' using the 5000 and 15000 boundaries.
train_buckets = k_fold_cross_validation.leq_range_buckets("pret", [5000, 15000])

# Report the size of every bucket and its share of the whole frame.
total_rows = len(df)
for bucket in train_buckets:
    bucket_size = len(train_buckets[bucket])
    share = bucket_size / total_rows
    print(f"`Bucket: {bucket}` contains {bucket_size} samples. Percentage of total: {share:.2%}")

In [None]:
# Build the folds for every bucket with make_folds' default settings.
kfolds = k_fold_cross_validation.make_folds(train_buckets)
print(f"Number of buckets: {len(kfolds)}")

# For each bucket, list the training and validation indices of every fold.
for bucket_name, folds_in_bucket in kfolds.items():
    print(f"Bucket: {bucket_name}")
    for fold_number, (train_idx, val_idx) in enumerate(folds_in_bucket):
        print(
            f"Fold: {fold_number}",
            f"Training indices: {train_idx}",
            f"Validation indices: {val_idx}",
            "",
            sep="\n",
        )

In [None]:
# Rebuild the folds with an explicit number of splits and no shuffling.
n_folds = 5
kfolds = k_fold_cross_validation.make_folds(
    train_buckets,
    n_splits=n_folds,
    shuffle=False,
)

# Turn the per-bucket folds into per-fold training and testing frames.
train_dfs, test_dfs = k_fold_cross_validation.get_train_test_folds(
    train_buckets, kfolds, n_folds
)
print(f"Number of folds: {len(train_dfs)}")

# Show the shape of each training/testing pair, followed by a blank line.
for fold_number, (train_part, test_part) in enumerate(zip(train_dfs, test_dfs)):
    print(
        f"Fold: {fold_number}",
        f"Training shape: {train_part.shape}",
        f"Testing shape: {test_part.shape}",
        "",
        sep="\n",
    )

print(df.columns)

In [None]:
# Plot the correlation with the price using the cross-validation helper's own frame.
k_fold_cross_validation.plot_correlation_with_price()

In [None]:
# Summarise the remaining missing values per column: absolute count and percentage,
# both ordered from the most to the least affected column.
null_mask = df.isnull()
null_counts = null_mask.sum()

total_null = null_counts.sort_values(ascending=False)
percent_null = (null_counts / null_mask.count() * 100).sort_values(ascending=False)

null_data = pd.concat(
    [total_null, percent_null],
    axis=1,
    keys=['Total', 'Percent'],
)
null_data.head(100)

#### 5. Finally, we run our Regression algorithms

In [None]:
n_folds = 5

# Numeric part of the frame; every numeric column except the price is a feature.
numeric_part = df.select_dtypes(include='number')
feature_columns = numeric_part.columns.tolist()
feature_columns.remove('pret')

# Target series and feature frame taken from the working frame.
target = df['pret']
features = df[feature_columns]

# Numeric frame without the price, with the car ids saved earlier attached.
num_dataframe = numeric_part.drop(columns='pret')
num_dataframe['id'] = car_ids

print(target)
print(num_dataframe.columns)

# Helper that runs the regression models (see the LinearRegression module).
linear_regression = LinearRegression(num_dataframe, features, target, n_folds)



In [None]:
# Run the regression pipeline with the model selected by the name "Decision Tree".
linear_regression.run_regression("Decision Tree")

In [None]:
# Run the regression pipeline with the model selected by the name "Random Forest".
linear_regression.run_regression("Random Forest")

In [None]:
# Run the regression pipeline with the model selected by the name "Extra Trees".
linear_regression.run_regression("Extra Trees")

In [None]:
# Run the regression pipeline with the model selected by the name "Gradient Boosting".
linear_regression.run_regression("Gradient Boosting")

In [None]:
import os

# Directory that receives the prediction CSV files.
output_dir = 'predictions'
# `exist_ok=True` makes the cell safe to re-run and avoids the check-then-create race.
os.makedirs(output_dir, exist_ok=True)

#### 6. Applying operations on the test dataset

In [None]:
# Build a separate cleaning helper for the leaderboard/test CSV and take its DataFrame.
data_cleaner_test = DataCleaning('assets/auto_test_leaderboard.csv')
df_test = data_cleaner_test.df

# NOTE(review): columns are dropped based on the test file alone, independently of the
# training file — confirm the remaining columns match those the models were trained on.
df_test = data_cleaner_test.delete_columns_with_nans(df_test)
# Fill the remaining gaps with the most frequent values (as done for the training frame).
df_test = data_cleaner_test.fill_nan_with_frequent(df_test)

In [None]:
# Inspect the numeric columns of the *test* frame that still contain NaNs.
# The column list is derived from `df_test` itself; deriving it from the training
# cleaner's frame would describe the wrong dataset and could name columns that
# do not exist in `df_test`.
numeric_columns = df_test.select_dtypes(include=['number'])
numeric_columns_with_nan = numeric_columns.columns[numeric_columns.isna().any()].tolist()
numeric_df_with_nan = df_test[numeric_columns_with_nan]
numeric_df_with_nan.info()
numeric_dataframe = pd.DataFrame(numeric_df_with_nan)

In [None]:
# Inspect the object (string-like) columns of the *test* frame that still contain NaNs.
# The column list is derived from `df_test` itself; deriving it from the training
# cleaner's frame would describe the wrong dataset and could name columns that
# do not exist in `df_test`.
object_columns = df_test.select_dtypes(include=['object'])
object_columns_with_nan = object_columns.columns[object_columns.isna().any()].tolist()
object_df_with_nan = df_test[object_columns_with_nan]
object_df_with_nan.info()
object_dataframe = pd.DataFrame(object_df_with_nan)

In [None]:
# Fill the numeric gaps of the test frame with medians.
# NOTE(review): this uses the *training* cleaner (`data_cleaner`) while the other test-set
# steps use `data_cleaner_test` — confirm which instance is intended (it matters if the
# method relies on state stored on the cleaner).
df_test = data_cleaner.fill_nan_with_median(df_test)

In [None]:
# Keep the test ids aside before the column is removed; they are needed later to
# label the predictions.
car_ids_test = df_test["id"]
print(car_ids_test)
df_test = df_test.drop(columns=['id', 'data'])

# Numeric columns of the test frame are the model features (the test file carries no
# price column, so nothing has to be removed from the list).
num_dataframe = df_test.select_dtypes(include='number')
print(num_dataframe.columns)
n_folds = 5
target = 'pret'
feature_columns = num_dataframe.columns.tolist()

# Attach the *test* ids (not the training ids) to the numeric frame.
num_dataframe['id'] = car_ids_test

# Feature frame handed to the trained models; it intentionally has no 'id' column.
new_data_features = df_test[feature_columns]

In [None]:
# Load the stored Decision Tree model and predict on the prepared test features.
loaded_model = linear_regression.load_model('Decision Tree_model.pk1')
predictions = LinearRegression.predict(loaded_model, new_data_features)

# `new_data_features` is built from `df_test` after its 'id' column was dropped, so the
# ids come from `car_ids_test`, which was saved before the drop.
new_data_predictions = pd.DataFrame({'id': car_ids_test.to_numpy(), 'value': predictions})

# Write the id/value pairs without the DataFrame index.
output_filepath = os.path.join(output_dir, 'decision_tree_new_data_predictions.csv')
new_data_predictions.to_csv(output_filepath, index=False)

In [None]:
# Load the stored Random Forest model and predict on the prepared test features.
loaded_model = linear_regression.load_model('Random Forest_model.pk1')
predictions = LinearRegression.predict(loaded_model, new_data_features)

# `new_data_features` is built from `df_test` after its 'id' column was dropped, so the
# ids come from `car_ids_test`, which was saved before the drop.
new_data_predictions = pd.DataFrame({'id': car_ids_test.to_numpy(), 'value': predictions})

# Write the id/value pairs without the DataFrame index.
output_filepath = os.path.join(output_dir, 'random_forest_new_data_predictions.csv')
new_data_predictions.to_csv(output_filepath, index=False)

In [None]:
# Load the stored Extra Trees model and predict on the prepared test features.
loaded_model = linear_regression.load_model('Extra Trees_model.pk1')
predictions = LinearRegression.predict(loaded_model, new_data_features)

# `new_data_features` is built from `df_test` after its 'id' column was dropped, so the
# ids come from `car_ids_test`, which was saved before the drop.
new_data_predictions = pd.DataFrame({'id': car_ids_test.to_numpy(), 'value': predictions})

# Write the id/value pairs without the DataFrame index.
output_filepath = os.path.join(output_dir, 'extra_trees_new_data_predictions.csv')
new_data_predictions.to_csv(output_filepath, index=False)

In [None]:
# Load the stored Gradient Boosting model and predict on the prepared test features.
loaded_model = linear_regression.load_model('Gradient Boosting_model.pk1')
predictions = LinearRegression.predict(loaded_model, new_data_features)

# `new_data_features` is built from `df_test` after its 'id' column was dropped, so the
# ids come from `car_ids_test`, which was saved before the drop.
new_data_predictions = pd.DataFrame({'id': car_ids_test.to_numpy(), 'value': predictions})

# Write the id/value pairs without the DataFrame index.
output_filepath = os.path.join(output_dir, 'gradient_boosting_new_data_predictions.csv')
new_data_predictions.to_csv(output_filepath, index=False)