In [3]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.5-cp312-cp312-win_amd64.whl.metadata (1.2 kB)
Downloading catboost-1.2.5-cp312-cp312-win_amd64.whl (101.1 MB)
   ---------------------------------------- 0.0/101.1 MB ? eta -:--:--
   ---------------------------------------- 0.0/101.1 MB ? eta -:--:--
   ---------------------------------------- 0.1/101.1 MB 656.4 kB/s eta 0:02:34
   ---------------------------------------- 0.1/101.1 MB 1.2 MB/s eta 0:01:23
   ---------------------------------------- 0.3/101.1 MB 1.6 MB/s eta 0:01:04
   ---------------------------------------- 0.4/101.1 MB 1.9 MB/s eta 0:00:54
   ---------------------------------------- 0.6/101.1 MB 2.0 MB/s eta 0:00:50
   ---------------------------------------- 0.7/101.1 MB 2.2 MB/s eta 0:00:47
   ---------------------------------------- 0.8/101.1 MB 2.3 MB/s eta 0:00:44
   ---------------------------------------- 0.9/101.1 MB 2.3 MB/s eta 0:00:44
   ---------------------------------------- 1.0/101.1 MB 2.2 MB/s eta 0:00:


[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import roc_auc_score

# Load the data
train_df = pd.read_csv('flight_delays_train.csv')
test_df = pd.read_csv('flight_delays_test.csv')
submission_df = pd.read_csv('sample_submission.csv')


print(train_df.isnull().sum())
print(test_df.isnull().sum())

# Combine train and test data to handle unseen categories
combined_df = pd.concat([train_df, test_df], sort=False)

# Encode Categorical Features
categorical_features = ['Month', 'DayofMonth', 'DayOfWeek', 'UniqueCarrier', 'Origin', 'Dest']

label_encoder = LabelEncoder()
for col in categorical_features:
    combined_df[col] = label_encoder.fit_transform(combined_df[col])

# Split back into train and test sets
train_df = combined_df.iloc[:len(train_df), :]
test_df = combined_df.iloc[len(train_df):, :]

# Feature Scaling
scaler = StandardScaler()
train_df[['DepTime', 'Distance']] = scaler.fit_transform(train_df[['DepTime', 'Distance']])
test_df[['DepTime', 'Distance']] = scaler.transform(test_df[['DepTime', 'Distance']])

# Split the Training Data into features (X) and the target (y)
X_train = train_df.drop(columns=['dep_delayed_15min'])
y_train = train_df['dep_delayed_15min'].apply(lambda x: 1 if x == 'Y' else 0)

# Optional: Split into train and validation sets
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Check shapes to ensure alignment
print(X_train.shape, y_train.shape)

Month                0
DayofMonth           0
DayOfWeek            0
DepTime              0
UniqueCarrier        0
Origin               0
Dest                 0
Distance             0
dep_delayed_15min    0
dtype: int64
Month            0
DayofMonth       0
DayOfWeek        0
DepTime          0
UniqueCarrier    0
Origin           0
Dest             0
Distance         0
dtype: int64
(100000, 8) (100000,)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df[['DepTime', 'Distance']] = scaler.fit_transform(train_df[['DepTime', 'Distance']])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df[['DepTime', 'Distance']] = scaler.transform(test_df[['DepTime', 'Distance']])


In [8]:
# Initialize and train the CatBoost model
model = CatBoostClassifier(cat_features=categorical_features,iterations=3000, loss_function='Logloss',
                               l2_leaf_reg=0.8, od_type='Iter',
                               random_seed=17, silent=True)
model.fit(X_train_split, y_train_split)

# Validate the model if needed
y_val_pred = model.predict_proba(X_val_split)[:, 1]
roc_auc = roc_auc_score(y_val_split, y_val_pred)
print(f'Validation ROC AUC: {roc_auc:.4f}')

# Prepare the test data for prediction
test_df_features = test_df.copy()

# Make Predictions on the Test Set
test_predictions = model.predict_proba(test_df_features)[:, 1]

# Ensure that 'submission_df' has an 'id' column and 'dep_delayed_15min' column
# Align the test predictions with the 'id' column in the submission file
submission_df['dep_delayed_15min'] = test_predictions

# Save the Submission File
submission_df.to_csv('submission2.csv', index=False)

print("Submission file created successfully.")

Validation ROC AUC: 0.7729
Submission file created successfully.


In [20]:
# Select and Train the Model
model = RandomForestClassifier(random_state=42, n_estimators=10000, max_depth=20, n_jobs=None,criterion='log_loss',max_features='log2', class_weight=None)
model.fit(X_train_split, y_train_split)

# Evaluate the Model using ROC AUC
val_predictions = model.predict_proba(X_val_split)[:, 1]
val_roc_auc = roc_auc_score(y_val_split, val_predictions)
print(f'Validation ROC AUC: {val_roc_auc}')

# You can also evaluate using cross-validation if needed
# from sklearn.model_selection import cross_val_score
# cv_scores = cross_val_score(model, X_train_split, y_train_split, cv=5, scoring='roc_auc')
# print(f'Mean ROC AUC (cross-validation): {np.mean(cv_scores)}')
#0.7373741807278432 20
#0.738192506513157 20 gini log2
#0.7413548802321208 20 log_loss log2
#0.7434947240442774 200 log_loss log2

Validation ROC AUC: 0.7462692404664005


In [21]:
# Prediction on the Test Set
# Ensure that 'dep_delayed_15min' column is dropped from the test set
test_df_features = test_df.drop(columns=['dep_delayed_15min'])

# Make Predictions on the Test Set
test_predictions = model.predict_proba(test_df_features)[:, 1]

# Prepare the Submission File
# Assuming the test dataset has an identifier for each flight, we'll create an 'id' column
# If no ID is provided, we'll generate a sequential ID.

# Prepare the Submission File
submission_df['dep_delayed_15min'] = test_predictions
submission_df.to_csv('submission_submit.csv', index=False)

print("Submission file created successfully.")


Submission file created successfully.


In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

# Load the dataset
train_data = pd.read_csv('flight_delays_train.csv')
test_data = pd.read_csv('flight_delays_test.csv')

# Preprocess the data
encoder = OneHotEncoder()
encoded_features = encoder.fit_transform(train_data[['Month', 'DayofMonth', 'DayOfWeek', 'UniqueCarrier', 'Origin', 'Dest']])

# Combine encoded features with other numerical features
X = pd.concat([pd.DataFrame(encoded_features.toarray()), train_data[['DepTime', 'Distance']]], axis=1)
y = train_data['dep_delayed_15min']

# Split the data
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
model = RandomForestClassifier()
model.fit(X_train, y_train)

# Evaluate the model
y_pred = model.predict_proba(X_val)[:, 1]
roc_auc = roc_auc_score(y_val, y_pred)
print(f'ROC AUC: {roc_auc}')

# Make predictions on the test set
encoded_test_features = encoder.transform(test_data[['Month', 'DayofMonth', 'DayOfWeek', 'UniqueCarrier', 'Origin', 'Dest']])
X_test = pd.concat([pd.DataFrame(encoded_test_features.toarray()), test_data[['DepTime', 'Distance']]], axis=1)
test_predictions = model.predict_proba(X_test)[:, 1]

# Prepare the submission file
submission = pd.DataFrame({'id': test_data.index, 'dep_delayed_15min': test_predictions})
submission.to_csv('submission1.csv', index=False)


TypeError: Feature names are only supported if all input features have string names, but your input has ['int', 'str'] as feature name / column name types. If you want feature names to be stored and validated, you must convert them all to strings, by using X.columns = X.columns.astype(str) for example. Otherwise you can remove feature / column names from your input data, or convert them all to a non-string data type.

In [26]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import roc_auc_score
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Load the data
train_df = pd.read_csv('flight_delays_train.csv')
test_df = pd.read_csv('flight_delays_test.csv')

# Combine train and test for preprocessing
all_data = pd.concat([train_df, test_df], axis=0, sort=False)

# Identify categorical and numerical columns
categorical_columns = ['Month', 'DayofMonth', 'DayOfWeek', 'UniqueCarrier', 'Origin', 'Dest']
numerical_columns = ['DepTime', 'Distance']

# Create preprocessing steps
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_columns),
        ('cat', categorical_transformer, categorical_columns)
    ])

# Fit the preprocessor on all data
all_features = preprocessor.fit_transform(all_data)

# Get feature names
onehot_encoder = preprocessor.named_transformers_['cat'].named_steps['onehot']
cat_feature_names = onehot_encoder.get_feature_names_out(categorical_columns)
feature_names = numerical_columns + cat_feature_names.tolist()

# Convert to DataFrame
all_features_df = pd.DataFrame(all_features.toarray(), columns=feature_names)

# Split back into train and test
X = all_features_df[:len(train_df)]
X_test = all_features_df[len(train_df):]
y = train_df['dep_delayed_15min']

# Split the training data
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Make predictions on validation set
y_val_pred = model.predict_proba(X_val)[:, 1]

# Calculate ROC AUC score
roc_auc = roc_auc_score(y_val, y_val_pred)
print(f'Validation ROC AUC: {roc_auc:.4f}')

# Make predictions on test set
test_predictions = model.predict_proba(X_test)[:, 1]

# Create submission file
submission = pd.DataFrame({'id': range(len(test_predictions)), 'dep_delayed_15min': test_predictions})
submission.to_csv('submission11.csv', index=False)

Validation ROC AUC: 0.7376


In [None]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

# Load the data
train_df = pd.read_csv('flight_delays_train.csv')
test_df = pd.read_csv('flight_delays_test.csv')

# Preprocess the data
categorical_cols = ['Month', 'DayofMonth', 'DayOfWeek', 'UniqueCarrier', 'Origin', 'Dest']
numerical_cols = ['DepTime', 'Distance']

ct = ColumnTransformer([
    ('onehot', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
    ('scaler', StandardScaler(), numerical_cols)
])

X_train = ct.fit_transform(train_df.drop('dep_delayed_15min', axis=1))
y_train = train_df['dep_delayed_15min']

X_test = ct.transform(test_df)

# Split the data into training and testing sets
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Train a Logistic Regression model
model = LogisticRegression(max_iter=10000)
model.fit(X_train, y_train)

# Evaluate the model
y_pred = model.predict_proba(X_val)[:, 1]
auc_score = roc_auc_score(y_val, y_pred)
print("ROC AUC Score:", auc_score)

# Make predictions on the test data
test_pred = model.predict_proba(X_test)[:, 1]

# Save the predictions to a submission file
submission_df = pd.DataFrame({'id': test_df.index, 'dep_delayed_15min': test_pred})
submission_df.to_csv('submission111.csv', index=False)

ROC AUC Score: 0.7141869047119445


KeyError: 'id'