In [None]:
# Attempt 1: no feature selection (all features are used)
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt

# Load the raw training and test splits (train first, then test).
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        'C:/Users/Admin/source/gist-mldl24f-hw3/train.csv',
        'C:/Users/Admin/source/gist-mldl24f-hw3/test.csv',
    )
)

# First rows of the training split, followed by column dtypes and
# non-null counts.
print("Training Data Overview:", train_df.head(), sep="\n")
print("\nData Information:")
train_df.info()

# Summary statistics of the numerical features.
print("\nData Description:", train_df.describe(), sep="\n")

# Number of missing values per column.
print("\nMissing Values in Training Data:", train_df.isnull().sum(), sep="\n")

def preprocess_datetime(df):
    """Split the ``dep_time`` / ``arr_time`` columns into hour and minute.

    Both columns are parsed as ``HH:MM``; values that do not match become
    missing. The columns ``dep_hour``, ``dep_minute``, ``arr_hour`` and
    ``arr_minute`` are added and the two source columns are removed.
    If either source column is absent the frame is left untouched.

    The frame is modified in place and also returned.
    """
    if 'dep_time' not in df.columns or 'arr_time' not in df.columns:
        return df

    # (source column, hour column, minute column)
    layout = (
        ('dep_time', 'dep_hour', 'dep_minute'),
        ('arr_time', 'arr_hour', 'arr_minute'),
    )
    for source, hour_col, minute_col in layout:
        clock = pd.to_datetime(df[source], format='%H:%M', errors='coerce')
        df[hour_col] = clock.dt.hour
        df[minute_col] = clock.dt.minute

    df.drop(columns=['dep_time', 'arr_time'], inplace=True, errors='ignore')
    return df

def preprocess_time_taken(df):
    """Replace the textual ``time_taken`` column by a duration in minutes.

    The digits directly before ``h`` are read as hours and the digits
    directly before ``m`` as minutes; a missing part counts as 0. The
    result is stored in ``time_taken_total_minutes`` and ``time_taken``
    is removed. Without a ``time_taken`` column the frame is untouched.

    The frame is modified in place and also returned.
    """
    if 'time_taken' not in df.columns:
        return df

    raw = df['time_taken']
    hours = raw.str.extract(r'(\d+)h', expand=False).astype(float).fillna(0)
    minutes = raw.str.extract(r'(\d+)m', expand=False).astype(float).fillna(0)
    df['time_taken_total_minutes'] = (hours * 60) + minutes

    # The helper column names are dropped as well so that any stale
    # intermediate columns never reach the model.
    df.drop(
        columns=['time_taken', 'time_taken_hours', 'time_taken_minutes'],
        inplace=True,
        errors='ignore',
    )
    return df

def preprocess_categoricals(df):
    """Label-encode the categorical columns that are present in *df*.

    Each of ``airline``, ``ch_code``, ``from``, ``to`` and ``class`` that
    exists in the frame is cast to ``str`` and replaced by the integer
    codes of a freshly fitted ``LabelEncoder``. The fitted encoders are
    local to this call and are not returned.

    The frame is modified in place and also returned.
    """
    for name in ('airline', 'ch_code', 'from', 'to', 'class'):
        if name not in df.columns:
            continue
        encoder = LabelEncoder()
        df[name] = encoder.fit_transform(df[name].astype(str))
    return df

def preprocess_invalid_columns(df):
    """Turn the ``date`` and ``stop`` columns into integer features.

    ``date`` is parsed with ``pd.to_datetime`` and replaced by whole
    seconds since the Unix epoch. ``stop`` is cast to ``str`` and replaced
    by the codes of a freshly fitted ``LabelEncoder``. A column that is
    not present is skipped.

    The frame is modified in place and also returned.
    """
    if 'date' in df.columns:
        parsed = pd.to_datetime(df['date'], errors='coerce')
        # Series.view() is deprecated in recent pandas; astype() is the
        # supported way to reach the underlying integers. The resolution
        # is pinned to nanoseconds first so that the division by 10**9
        # always yields seconds.
        # NOTE(review): values that fail to parse are coerced to NaT and
        # end up as a meaningless sentinel integer - confirm the column
        # contains no unparseable dates.
        df['date'] = parsed.astype('datetime64[ns]').astype('int64') // 10**9
    if 'stop' in df.columns:
        df['stop'] = LabelEncoder().fit_transform(df['stop'].astype(str))
    return df

# Time-of-day and duration parsing only looks at one row at a time, so each
# split can be processed on its own.
train_df = preprocess_time_taken(preprocess_datetime(train_df))
test_df = preprocess_time_taken(preprocess_datetime(test_df))

# Label encoding is stateful: an encoder fitted on one split numbers the
# categories it has seen in sorted order. Fitting separately on train and
# test therefore gives the same category different integer codes whenever
# the two splits do not contain exactly the same set of values, and the
# model would then be scored on mislabeled categories. Encode the columns
# shared by both splits on one stacked frame so the codes agree, then write
# the encoded columns back to each split (rows are matched by index).
stacked = pd.concat([train_df, test_df], keys=['train', 'test'], join='inner')
stacked = preprocess_categoricals(stacked)
stacked = preprocess_invalid_columns(stacked)
train_df[stacked.columns] = stacked.loc['train']
test_df[stacked.columns] = stacked.loc['test']

# Show the engineered / encoded columns that actually exist after
# preprocessing.
print("\nProcessed Data Overview:")
columns_to_display = [
    col
    for col in (
        'dep_hour', 'dep_minute', 'arr_hour', 'arr_minute',
        'time_taken_total_minutes', 'airline', 'ch_code', 'from', 'to', 'class',
        'date', 'stop',
    )
    if col in train_df.columns
]
print(train_df[columns_to_display].head())

# Features are every column except the row identifier and the target.
X = train_df.drop(columns=['id', 'price'], errors='ignore')
y = train_df['price']

# Hold out 20% of the rows for validation / early stopping.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

train_data = lgb.Dataset(X_train, label=y_train)
val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

params = {
    'objective': 'regression',
    'metric': 'mape',
    'boosting_type': 'gbdt',
    'num_leaves': 63,
    'learning_rate': 0.1,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.9,
    # LightGBM only applies bagging_fraction when bagging_freq is non-zero;
    # without this entry the 0.9 above is silently ignored.
    'bagging_freq': 1,
    'lambda_l1': 0.1,
    'lambda_l2': 0.1,
}

# Train for at most num_round iterations; stop once the validation metric
# has not improved for 50 consecutive rounds.
num_round = 1000
bst = lgb.train(
    params,
    train_data,
    num_boost_round=num_round,
    valid_sets=[val_data],
    valid_names=['validation'],
    callbacks=[lgb.early_stopping(stopping_rounds=50)],
)

# Score the hold-out split at the best iteration and report the error, so
# the validation predictions are actually used.
y_val_pred = bst.predict(X_val, num_iteration=bst.best_iteration)
val_mape = np.mean(np.abs((y_val - y_val_pred) / y_val))
print(f"Validation MAPE: {val_mape:.4f}")

# Select the test features by the training column names so the model always
# receives the same columns, in the same order, that it was fitted on.
test_data = test_df[X_train.columns]
test_pred = bst.predict(test_data, num_iteration=bst.best_iteration)

submission = pd.DataFrame({
    'id': test_df['id'],
    'price': test_pred
})

submission.to_csv('submissionNew.csv', index=False)

print("Prediction and submission file saved.")

In [None]:
# Attempt 2: determine which features are important for predicting the target variable
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt

# Load the raw training and test splits (train first, then test).
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        'C:/Users/Admin/source/gist-mldl24f-hw3/train.csv',
        'C:/Users/Admin/source/gist-mldl24f-hw3/test.csv',
    )
)

# First rows of the training split, followed by column dtypes and
# non-null counts.
print("Training Data Overview:", train_df.head(), sep="\n")
print("\nData Information:")
train_df.info()

def preprocess_datetime(df):
    """Split the ``dep_time`` / ``arr_time`` columns into hour and minute.

    Both columns are parsed as ``HH:MM``; values that do not match become
    missing. The columns ``dep_hour``, ``dep_minute``, ``arr_hour`` and
    ``arr_minute`` are added and the two source columns are removed.
    If either source column is absent the frame is left untouched.

    The frame is modified in place and also returned.
    """
    if 'dep_time' not in df.columns or 'arr_time' not in df.columns:
        return df

    # (source column, hour column, minute column)
    layout = (
        ('dep_time', 'dep_hour', 'dep_minute'),
        ('arr_time', 'arr_hour', 'arr_minute'),
    )
    for source, hour_col, minute_col in layout:
        clock = pd.to_datetime(df[source], format='%H:%M', errors='coerce')
        df[hour_col] = clock.dt.hour
        df[minute_col] = clock.dt.minute

    df.drop(columns=['dep_time', 'arr_time'], inplace=True, errors='ignore')
    return df

def preprocess_time_taken(df):
    """Replace the textual ``time_taken`` column by a duration in minutes.

    The digits directly before ``h`` are read as hours and the digits
    directly before ``m`` as minutes; a missing part counts as 0. The
    result is stored in ``time_taken_total_minutes`` and ``time_taken``
    is removed. Without a ``time_taken`` column the frame is untouched.

    The frame is modified in place and also returned.
    """
    if 'time_taken' not in df.columns:
        return df

    raw = df['time_taken']
    hours = raw.str.extract(r'(\d+)h', expand=False).astype(float).fillna(0)
    minutes = raw.str.extract(r'(\d+)m', expand=False).astype(float).fillna(0)
    df['time_taken_total_minutes'] = (hours * 60) + minutes

    # The helper column names are dropped as well so that any stale
    # intermediate columns never reach the model.
    df.drop(
        columns=['time_taken', 'time_taken_hours', 'time_taken_minutes'],
        inplace=True,
        errors='ignore',
    )
    return df

def preprocess_categoricals(df):
    """Label-encode the categorical columns that are present in *df*.

    Each of ``airline``, ``ch_code``, ``from``, ``to`` and ``class`` that
    exists in the frame is cast to ``str`` and replaced by the integer
    codes of a freshly fitted ``LabelEncoder``. The fitted encoders are
    local to this call and are not returned.

    The frame is modified in place and also returned.
    """
    for name in ('airline', 'ch_code', 'from', 'to', 'class'):
        if name not in df.columns:
            continue
        encoder = LabelEncoder()
        df[name] = encoder.fit_transform(df[name].astype(str))
    return df

def preprocess_invalid_columns(df):
    """Turn the ``date`` and ``stop`` columns into integer features.

    ``date`` is parsed with ``pd.to_datetime`` and replaced by whole
    seconds since the Unix epoch. ``stop`` is cast to ``str`` and replaced
    by the codes of a freshly fitted ``LabelEncoder``. A column that is
    not present is skipped.

    The frame is modified in place and also returned.
    """
    if 'date' in df.columns:
        parsed = pd.to_datetime(df['date'], errors='coerce')
        # Series.view() is deprecated in recent pandas; astype() is the
        # supported way to reach the underlying integers. The resolution
        # is pinned to nanoseconds first so that the division by 10**9
        # always yields seconds.
        # NOTE(review): values that fail to parse are coerced to NaT and
        # end up as a meaningless sentinel integer - confirm the column
        # contains no unparseable dates.
        df['date'] = parsed.astype('datetime64[ns]').astype('int64') // 10**9
    if 'stop' in df.columns:
        df['stop'] = LabelEncoder().fit_transform(df['stop'].astype(str))
    return df

# Time-of-day and duration parsing only looks at one row at a time, so each
# split can be processed on its own.
train_df = preprocess_time_taken(preprocess_datetime(train_df))
test_df = preprocess_time_taken(preprocess_datetime(test_df))

# Label encoding is stateful: an encoder fitted on one split numbers the
# categories it has seen in sorted order. Fitting separately on train and
# test therefore gives the same category different integer codes whenever
# the two splits do not contain exactly the same set of values, and the
# model would then be scored on mislabeled categories. Encode the columns
# shared by both splits on one stacked frame so the codes agree, then write
# the encoded columns back to each split (rows are matched by index).
stacked = pd.concat([train_df, test_df], keys=['train', 'test'], join='inner')
stacked = preprocess_categoricals(stacked)
stacked = preprocess_invalid_columns(stacked)
train_df[stacked.columns] = stacked.loc['train']
test_df[stacked.columns] = stacked.loc['test']

# Show the engineered / encoded columns that actually exist after
# preprocessing.
print("\nProcessed Data Overview:")
columns_to_display = [
    col
    for col in (
        'dep_hour', 'dep_minute', 'arr_hour', 'arr_minute',
        'time_taken_total_minutes', 'airline', 'ch_code', 'from', 'to', 'class',
        'date', 'stop',
    )
    if col in train_df.columns
]
print(train_df[columns_to_display].head())

# Feature selection: X keeps every column except the row identifier 'id'
# and the target 'price' (first attempt). For the reduced second attempt,
# add 'dep_minute', 'dep_hour', 'ch_code', 'arr_minute', 'arr_hour' and
# 'stop' to this drop list.
X = train_df.drop(columns=['id', 'price'], errors='ignore')
y = train_df['price']

# Split the data into training and validation sets (80-20 split).
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

train_data = lgb.Dataset(X_train, label=y_train)
val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

params = {
    'objective': 'regression',
    'metric': 'mape',
    'boosting_type': 'gbdt',
    'num_leaves': 63,
    'learning_rate': 0.1,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.9,
    # LightGBM only applies bagging_fraction when bagging_freq is non-zero;
    # without this entry the 0.9 above is silently ignored.
    'bagging_freq': 1,
    'lambda_l1': 0.1,
    'lambda_l2': 0.1,
}

# Train for at most num_round iterations; stop once the validation metric
# has not improved for 50 consecutive rounds.
num_round = 1000
bst = lgb.train(
    params,
    train_data,
    num_boost_round=num_round,
    valid_sets=[val_data],
    valid_names=['validation'],
    callbacks=[lgb.early_stopping(stopping_rounds=50)],
)

# Feature analysis: rank the training features by the total gain of the
# splits that use them.
feature_names = X_train.columns
importance = bst.feature_importance(importance_type='gain')

importance_df = pd.DataFrame(
    {'Feature': feature_names, 'Importance': importance}
).sort_values(by='Importance', ascending=False)

print("\nFeature Importance:")
print(importance_df)

# Horizontal bar chart with the most important feature at the top.
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(importance_df['Feature'], importance_df['Importance'], color='skyblue')
ax.invert_yaxis()
ax.set_xlabel('Importance')
ax.set_title('Feature Importance')
plt.show()

# Predict with exactly the columns the model was trained on. Dropping a
# hand-written list of columns here (as an earlier revision did with
# 'dep_minute', 'dep_hour', 'ch_code', 'arr_minute', 'arr_hour', 'stop')
# hands the model fewer features than it was fitted on whenever the
# training drop list differs, and LightGBM then rejects the input. Taking
# the column list from X_train keeps both sides in sync for any feature
# selection.
test_data = test_df[X_train.columns]
test_pred = bst.predict(test_data, num_iteration=bst.best_iteration)

submission = pd.DataFrame({
    'id': test_df['id'],
    'price': test_pred
})

submission.to_csv('submissionNewNew.csv', index=False)
print("Prediction and submission file saved.")