### Analyzing and cleaning the data

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [None]:
# Read the three CSV files (paths relative to the working directory) into DataFrames
X_train, X_test, y_train = (
    pd.read_csv(csv_name)
    for csv_name in ('X_train_78VdSWL.csv', 'X_test_XKVc4no.csv', 'y_train_u0UkKEh.csv')
)

In [None]:
# (rows, columns) of each loaded frame, in the order X_train, X_test, y_train
tuple(frame.shape for frame in (X_train, X_test, y_train))

In [None]:
# Preview the first rows of each frame (display renders every argument in turn)
display(X_train.head(), X_test.head(), y_train.head())

In [None]:
# Patch the NaN cells of X_train with the value y_train holds at the same
# row index and column label; cells without a counterpart in y_train stay NaN.
X_train = X_train.fillna(y_train)

In [None]:
# Names of the X_train columns that still hold at least one NaN
missing_cols = X_train.columns[X_train.isna().any()]
missing_cols

In [None]:
# Drop the two hard-coded columns '7986541023' and 'holed_192' from X_train
# (presumably the columns reported by the missing-value check -- verify against its output)
X_train = X_train.drop(["7986541023", "holed_192"], axis=1)

The first columns of `X_test` can be treated as training data as well; the actual test data are the last 1000 columns of `X_test`.

In [None]:
# Promote the first column of each frame to the row index.
# NOTE: this cell is not idempotent -- running it a second time would promote
# the *next* column to the index and discard the current one.
X_test = X_test.set_index(X_test.columns[0])
X_train = X_train.set_index(X_train.columns[0])

# Partition X_test column-wise
X_test_training = X_test.iloc[:, :-1000]  # everything before the trailing 1000 columns
test_data = X_test.iloc[:, -1000:]  # the trailing 1000 columns

In [None]:
# Preview the re-indexed training frame and the two parts split off X_test
display(X_train.head(), X_test_training.head(), test_data.head())

In [None]:
# Names of the X_test_training columns that hold at least one NaN
missing_cols = X_test_training.columns[X_test_training.isna().any()]
missing_cols

In [None]:
# Drop four hard-coded columns from X_test_training
# (presumably those reported by the missing-value check -- verify against its output)
X_test_training = X_test_training.drop(
    ['1250968743', '1753428906', '7132659048', '7813042569'],
    axis=1,
)

Encode the `holed_` columns of `X_train` before merging it with `X_test_training`.

In [None]:
# The trailing 999 column names of X_train
last_999_cols = X_train.columns[-999:]

# Pair each of those names with a string label counting up from "2001"
# (i.e. "2001" ... "2999" when all 999 columns are present)
column_mapping = dict(
    zip(last_999_cols, map(str, range(2001, 2001 + len(last_999_cols))))
)

# Relabel the columns according to the mapping
X_train = X_train.rename(columns=column_mapping)

Before merging `X_train` and `X_test_training`, we check that they have no ID in common.

In [None]:
# Collect the column labels of both frames as plain Python lists
train_col = list(X_train.columns)
test_col = list(X_test_training.columns)

In [None]:
# Report whether the two column-name lists share any label
if set(train_col) & set(test_col):
    print("They have common elements!")
else:
    print("No common elements!")

Merge all the training data 

In [None]:
# Place the two training frames side by side; rows are aligned on the index
merged_df = pd.concat([X_train, X_test_training], axis="columns")

# Persist the merged training data and the test data
# (to_csv writes the index column by default)
merged_df.to_csv("merged_train.csv")
test_data.to_csv("test_data.csv")

In [None]:
# Preview the merged training frame and the test frame
display(merged_df.head(), test_data.head())