In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

# Load the training and test complaint sheets from their Excel workbooks
train_data = pd.read_excel("ConsumerComplaints_train.xlsx")
test_data = pd.read_excel("ConsumerComplaints_test.xlsx")

# Report the dtype of every column, training frame first.
# The test heading starts with a newline so the two reports are separated.
dtype_reports = [
    ("Data types of train data:", train_data),
    ("\nData types of test data:", test_data),
]
for heading, frame in dtype_reports:
    print(heading)
    print(frame.dtypes)

# Report the number of missing cells per column, in the same order
missing_reports = [
    ("Missing values in train data:", train_data),
    ("\nMissing values in test data:", test_data),
]
for heading, frame in missing_reports:
    print(heading)
    print(frame.isna().sum())

# Parse BOTH date columns explicitly. The subtraction further down needs
# datetimes on each side, and read_excel only yields datetimes when the cells
# are stored as dates in the workbook.
train_data['Date Received'] = pd.to_datetime(train_data['Date Received'])
train_data['Date Sent to Company'] = pd.to_datetime(train_data['Date Sent to Company'])

# Extract day, month, year from Date Received column
train_data['Year_Received'] = train_data['Date Received'].dt.year
train_data['Month_Received'] = train_data['Date Received'].dt.month
train_data['Day_Received'] = train_data['Date Received'].dt.day

# Calculate number of days the complaint was with the company
# (whole days between receipt and forwarding; negative if the sent date is earlier)
train_data['Days_held'] = (train_data['Date Sent to Company'] - train_data['Date Received']).dt.days

# Drop unnecessary columns: the raw dates are replaced by the derived features
# above; 'ZIP code' and 'Complaint ID' are not used anywhere below.
train_data = train_data.drop(columns=['Date Received', 'Date Sent to Company', 'ZIP code', 'Complaint ID'])

# Impute null values in State by mode.
# Assign the result back instead of calling fillna(inplace=True) on the column
# selection: that chained form is deprecated in pandas 2.x and does not modify
# the frame when Copy-on-Write is enabled.
train_data['State'] = train_data['State'].fillna(train_data['State'].mode()[0])

# Create Week_Received based on the day of receiving.
# NOTE(review): with `// 7 + 1`, days 1-6 fall in week 1 and day 7 already
# falls in week 2 (days 28-31 give week 5). Kept as-is so downstream results
# do not shift; use `(day - 1) // 7 + 1` if 1-7 should be week 1.
train_data['Week_Received'] = train_data['Day_Received'] // 7 + 1

# Keep the subset of complaints that the consumer disputed
disputed_cons = train_data.loc[train_data['Consumer disputed?'].eq('Yes')]

# Bar chart: overall count of disputed vs. non-disputed complaints
sns.countplot(data=train_data, x='Consumer disputed?')
plt.show()

# Bar chart: dispute counts per product (labels tilted so they stay readable)
sns.countplot(data=train_data, x='Product', hue='Consumer disputed?')
plt.xticks(rotation=45)
plt.show()

# Bar chart: dispute counts for the ten most frequent issues,
# listed in descending order of frequency
issue_counts = train_data['Issue'].value_counts()
top_issues = issue_counts.index[:10]
sns.countplot(data=train_data, y='Issue', hue='Consumer disputed?', order=top_issues)
plt.show()

# Continue plotting other bar graphs...

# Convert negative Days_held to zero (missing values stay missing)
train_data['Days_held'] = train_data['Days_held'].clip(lower=0)

# Drop unnecessary columns for model building
train_data = train_data.drop(columns=['Company', 'State', 'Year_Received', 'Days_held'])

# Change Consumer disputed Column to 0 and 1 (any other value becomes NaN)
train_data['Consumer disputed?'] = train_data['Consumer disputed?'].map({'Yes': 1, 'No': 0})

# Create Dummy Variables for categorical features
train_data = pd.get_dummies(
    train_data,
    columns=['Product', 'Submitted via', 'Company response to consumer', 'Timely response?'],
)

# Only the four columns above are one-hot encoded. Any text column still left
# in the frame (e.g. 'Issue', which is plotted earlier but never encoded)
# would make StandardScaler raise on non-numeric data, so drop whatever
# non-numeric columns remain and say so explicitly. This is a no-op when
# every remaining column is already numeric/boolean.
non_numeric_cols = (
    train_data
    .select_dtypes(exclude=['number', 'bool'])
    .columns
    .drop('Consumer disputed?', errors='ignore')
)
if len(non_numeric_cols) > 0:
    print(f"Dropping non-numeric columns before modelling: {list(non_numeric_cols)}")
    train_data = train_data.drop(columns=non_numeric_cols)

# Separate the predictors from the 0/1 target
features = train_data.drop(columns=['Consumer disputed?'])
y = train_data['Consumer disputed?']

# Split data into train and test sets FIRST, so that the scaler and PCA below
# are fitted on training rows only. Fitting them on the full data lets
# information from the held-out rows leak into the preprocessing and makes the
# reported test accuracy optimistic. Same test_size/random_state as before, so
# the row partition itself is unchanged.
features_train, features_test, y_train, y_test = train_test_split(
    features, y, test_size=0.3, random_state=42
)

# Standardize the Data Sets (statistics learned from the training rows)
scaler = StandardScaler()
scaled_train = scaler.fit_transform(features_train)
scaled_test = scaler.transform(features_test)

# Make feature selection with PCA: keep as many components as needed to
# explain 80% of the variance of the training rows
pca = PCA(n_components=0.8)
X_train = pca.fit_transform(scaled_train)
X_test = pca.transform(scaled_test)

# Full-data views, kept under their original names for any later cell that
# still refers to them; produced with the train-fitted transformers.
X = scaler.transform(features)
X_pca = pca.transform(X)

# Candidate classifiers, each created with its library-default settings
models = dict([
    ('Logistic Regression', LogisticRegression()),
    ('Decision Tree', DecisionTreeClassifier()),
    ('Random Forest', RandomForestClassifier()),
    ('AdaBoost', AdaBoostClassifier()),
    ('Gradient Boosting', GradientBoostingClassifier()),
    ('KNN', KNeighborsClassifier()),
    ('XGBoost', XGBClassifier()),
])

# Fit every candidate on the training split, then print its accuracy on the
# training split and on the held-out split
for name in models:
    model = models[name]
    model.fit(X_train, y_train)
    train_accuracy, test_accuracy = (
        model.score(X_train, y_train),
        model.score(X_test, y_test),
    )
    print(f"{name}: Train Accuracy - {train_accuracy}, Test Accuracy - {test_accuracy}")

# Use the best model to predict the outcome for the test file.
# Reuse the Random Forest that was already fitted and scored in the comparison
# loop, so the predictions come from the very model whose accuracy was printed
# (a fresh, unseeded forest would be a different model).
best_model = models['Random Forest']

# Work on a copy so the raw test columns survive for the exported file
test_features = test_data.copy()

# Rebuild the date-derived features exactly as they were built for training
test_received = pd.to_datetime(test_features['Date Received'])
test_features['Month_Received'] = test_received.dt.month
test_features['Day_Received'] = test_received.dt.day
test_features['Week_Received'] = test_features['Day_Received'] // 7 + 1

# One-hot encode the same categorical columns as the training frame
test_features = pd.get_dummies(
    test_features,
    columns=['Product', 'Submitted via', 'Company response to consumer', 'Timely response?'],
)

# Align with the exact columns (and order) the scaler was fitted on:
# categories absent from the test file become all-zero dummies, and columns
# the model never saw (raw dates, IDs, unseen categories, ...) are discarded.
feature_columns = getattr(scaler, 'feature_names_in_', None)
if feature_columns is None:
    # Older scikit-learn without feature_names_in_: fall back to the
    # training frame's predictor columns.
    feature_columns = train_data.columns.drop('Consumer disputed?')
test_features = test_features.reindex(columns=feature_columns, fill_value=0)

# Apply the fitted scaler and PCA, then predict (1 = 'Yes', 0 = 'No')
test_data_pca = pca.transform(scaler.transform(test_features))
predictions = best_model.predict(test_data_pca)

# Export the original test columns together with the predicted label
test_data['Consumer disputed?'] = predictions
test_data.to_excel("ConsumerComplaints_test_predictions.xlsx", index=False)
