In [218]:
import pandas as pd
import matplotlib.pyplot as plt 

In [219]:
df = pd.read_excel('data/raw/claims-data-2015-as-of-feb-9-2016.xlsx')
df

Unnamed: 0,Claim Number,Date Received,Incident D,Airport Code,Airport Name,Airline Name,Claim Type,Claim Site,Item Category,Close Amount,Disposition
0,2015060222904,2015-05-28,2015-05-21,ABE,"Lehigh Valley International Airport, Allentown",Allegiant Air,Property Damage,Checkpoint,Personal Electronics,0,Deny
1,2015082725115,2015-08-21,2015-08-01,ABE,"Lehigh Valley International Airport, Allentown",-,Property Damage,Checked Baggage,Sporting Equipment & Supplies,180,Settle
2,2015102326219,2015-10-07,2015-10-02,ABE,"Lehigh Valley International Airport, Allentown",Allegiant Air,Passenger Property Loss,Checked Baggage,Clothing,60,Approve in Full
3,2015122027695,2015-10-27,2015-10-19,ABE,"Lehigh Valley International Airport, Allentown",Allegiant Air,Property Damage,Checkpoint,Personal Accessories,-,-
4,2015012220065,2015-01-22,2014-12-23,ABI,Abilene Regional,American Airlines,Property Damage,Checked Baggage,Home Decor,0,Deny
...,...,...,...,...,...,...,...,...,...,...,...
8662,2015120427297,2015-11-20,2015-10-16,-,-,-,Property Damage,Checked Baggage,"Baggage/Cases/Purses; Books, Magazines & Other...",-,-
8663,2015123027969,2015-12-17,2015-12-02,-,-,-,Property Damage,Checked Baggage,Audio/Video; Home Decor,-,-
8664,2016010428072,2015-12-22,2015-12-20,-,-,-,Passenger Property Loss,Checked Baggage,Clothing,-,-
8665,2016011328300,2015-12-30,2015-12-28,-,-,-,Passenger Property Loss,Checked Baggage,Tools & Home Improvement Supplies,-,-


In [220]:
top_items = df['Item Category'].value_counts().head(10).index
df['Item Category'] = df['Item Category'].apply(lambda x: x if x in top_items else 'Other')

In [221]:
df['Disposition'] = df['Disposition'].map(lambda x: 1 if x == 'Deny' else 0)
df['Disposition'].value_counts()

Disposition
0    5093
1    3574
Name: count, dtype: int64

In [222]:
df['Close Amount'] = pd.to_numeric(df['Close Amount'], errors='coerce')
df = df.dropna().copy()

In [223]:
df = df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)
df = df[~df.isin(['-']).any(axis=1)].copy()

In [224]:
top_airlines = df['Airline Name'].value_counts().head(5).index
df = df[df['Airline Name'].isin(top_airlines)].copy()

In [225]:
columns_to_drop = ['Claim Number', 'Airport Code', 'Airport Name', 'Date Received', 'Incident D']
df = df.drop(columns=columns_to_drop).copy()

In [226]:
df

Unnamed: 0,Airline Name,Claim Type,Claim Site,Item Category,Close Amount,Disposition
4,American Airlines,Property Damage,Checked Baggage,Other,0.00,1
5,American Airlines,Passenger Property Loss,Checked Baggage,Baggage/Cases/Purses,200.00,0
6,American Airlines,Passenger Property Loss,Checked Baggage,Personal Electronics,0.00,1
7,Delta Air Lines,Passenger Property Loss,Checked Baggage,Clothing,0.00,1
9,Southwest Airlines,Passenger Property Loss,Checked Baggage,Other,136.49,0
...,...,...,...,...,...,...
8563,USAir,Passenger Property Loss,Checked Baggage,Other,40.00,0
8564,USAir,Passenger Property Loss,Checked Baggage,Other,149.99,0
8565,USAir,Passenger Property Loss,Checked Baggage,Computer & Accessories,0.00,1
8568,USAir,Passenger Property Loss,Checked Baggage,Other,106.99,0


In [227]:
from sklearn.model_selection import train_test_split

X = df.drop('Disposition', axis=1)
y = df['Disposition']

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

In [228]:
num_cols = X_train.select_dtypes(include='number').columns.tolist()
cat_cols = X_train.select_dtypes(include='object').columns.tolist()

In [229]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

full_pipeline = ColumnTransformer([
    ('num', num_pipeline, num_cols),
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
])


In [230]:
X_train_set = full_pipeline.fit_transform(X_train)
X_test_set = full_pipeline.transform(X_test)

In [231]:


# Converter X_train_set para DataFrame
X_train_df = pd.DataFrame(X_train_set.toarray() if hasattr(X_train_set, "toarray") else X_train_set)
X_train_df.head(10)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,20
0,0.103905,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.392829,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,-0.392829,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,0.559765,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,-0.392829,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
5,-0.392829,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
6,2.773527,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
7,5.918918,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
8,-0.392829,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
9,-0.057003,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [232]:
import numpy as np

np.save("data/processed/X_train.npy", X_train_set)
y_train.to_csv("data/processed/y_train.csv", index=False)

np.save("data/processed/X_test.npy", X_test_set)
y_test.to_csv("data/processed/y_test.csv", index=False)