In [None]:
import pandas as pd
# Read the credit-card transactions CSV into a DataFrame.
DATA_PATH = '/kaggle/input/creditcardfraud/creditcard.csv'
data = pd.read_csv(DATA_PATH)

# `df` is a second name for the SAME DataFrame object (no copy is made), so
# any column assigned through `df` later is visible through `data` as well.
df = data

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max), printed as text.
summary_stats = data.describe()
print(summary_stats)

In [None]:
# Separate the rows by label value: Class == 0 -> legal_df, Class == 1 -> illegal_df.
class_label = data['Class']
legal_df = data[class_label.eq(0)]
illegal_df = data[class_label.eq(1)]

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# Side-by-side histograms (with KDE overlay) of `Amount` for each class subset.
class_panels = (
    (legal_df, 'Legal Transaction Amount Distribution'),
    (illegal_df, 'Illegal Transaction Amount Distribution'),
)
fig, axs = plt.subplots(1, 2, figsize=(20, 6))
for panel_ax, (class_frame, panel_title) in zip(axs, class_panels):
    sns.histplot(class_frame['Amount'], bins=50, kde=True, ax=panel_ax)
    panel_ax.set_title(panel_title)
plt.show()



In [None]:
# Correlation of every column with `Class`; the label's self-correlation is
# removed so that only the features remain.
correlation_matrix = df.corr()
correlation_with_class = correlation_matrix['Class'].drop('Class')

# Bar chart: one bar per feature.
fig, ax = plt.subplots(figsize=(10, 6))
correlation_with_class.plot(kind='bar', color='skyblue', ax=ax)
ax.set_title('Correlation with Class')
ax.set_xlabel('Features')
ax.set_ylabel('Correlation Coefficient')
plt.setp(ax.get_xticklabels(), rotation=45)
ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

# Note: only the strictly positive coefficients are printed below.
is_positive = correlation_with_class > 0
print("Correlation Coefficients with Class:")
print(correlation_with_class[is_positive])


In [None]:
# Recompute the correlations so this cell can run on its own.
correlation_matrix = df.corr()
correlation_with_class = correlation_matrix['Class'].drop('Class')

# Keep only the features whose correlation with Class is strictly positive.
positive_correlation_features = [
    feature_name
    for feature_name, coefficient in correlation_with_class.items()
    if coefficient > 0
]

# Pairwise correlations among those features only.
correlation_matrix_filtered = correlation_matrix.loc[
    positive_correlation_features, positive_correlation_features
]

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix_filtered, annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
ax.set_title('Correlation Heatmap of Features with Most Correlation to Class')
plt.show()

In [None]:
correlation_matrix = df.corr()

# Rank the features by the absolute value of their correlation with Class.
# `Class` itself is dropped first: its self-correlation is always 1.0, so it
# would otherwise occupy the first of the 20 "top feature" slots.
most_correlated_features = (
    correlation_matrix['Class']
    .drop('Class')
    .sort_values(key=abs, ascending=False)
)
top_features = most_correlated_features.index[:20]

# Single-row heatmap: Class against its 20 most strongly correlated features.
plt.figure(figsize=(14, 8))
sns.heatmap(correlation_matrix.loc[['Class'], top_features], annot=True, fmt=".2f", cmap='coolwarm')
plt.title('Heatmap of the Top Correlated Features with Class')
plt.show()

most_correlated_features.head(20)


In [None]:
# Interpret the `Time` column as a number of seconds and convert to timedeltas.
Delta_Time = pd.to_timedelta(df['Time'], unit='s')

# Break each timedelta into its day / hour / minute components once, then
# store each component on `df` as an integer column (new derived columns).
time_components = Delta_Time.dt.components
derived_columns = (
    ('Time_Day', 'days'),
    ('Time_Hour', 'hours'),
    ('Time_Min', 'minutes'),
)
for new_column, component in derived_columns:
    df[new_column] = time_components[component].astype('int')

In [None]:
# Persist the frame (including the derived time columns) to the working
# directory. The row index is written too, as the first, unnamed column.
time_df_path = "Time_df.csv"
df.to_csv(time_df_path)

In [None]:
# 2x2 grid: one boxplot of Amount per class, then Amount against each of the
# derived time columns, coloured by class.
fig, axes_grid = plt.subplots(2, 2, figsize=(18, 16))
panel_axes = axes_grid.ravel()  # row-major: top-left, top-right, bottom-left, bottom-right

# Boxplot
sns.boxplot(x='Class', y='Amount', data=df, ax=panel_axes[0])
panel_axes[0].set_title('Transaction Amount by Class')

# Time by Day / Hour / Minute vs Amount
scatter_panels = (
    ('Time_Day', 'Transaction Amount vs. Day'),
    ('Time_Hour', 'Transaction Amount vs. Hour'),
    ('Time_Min', 'Transaction Amount vs. Minute'),
)
for panel_ax, (time_column, panel_title) in zip(panel_axes[1:], scatter_panels):
    sns.scatterplot(x=time_column, y='Amount', hue='Class', data=df, alpha=0.6, ax=panel_ax)
    panel_ax.set_title(panel_title)

plt.tight_layout()
plt.show()

# Per-class summary statistics of Amount; shown as the cell's output.
amount_stats_by_class = df.groupby('Class')['Amount'].describe()
amount_stats_by_class


In [None]:
# Rows whose derived day component is 0 (first day) and 1 (second day).
firstday_df = df[df["Time_Day"] == 0]
second_day = df[df["Time_Day"] == 1]

# One line plot per day: Amount against time of day in fractional hours.
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(12, 6))
day_panels = (
    (ax1, firstday_df, 'Line 1', 'First day Transactions'),
    (ax2, second_day, 'Line 2', 'Second day Transactions'),
)
for day_ax, day_frame, line_label, panel_title in day_panels:
    hours_into_day = day_frame['Time_Hour'] + day_frame['Time_Min'] / 60
    day_ax.plot(hours_into_day, day_frame['Amount'], label=line_label)
    day_ax.set_title(panel_title)
    day_ax.set_xlabel('Time (Hours)')
    day_ax.set_ylabel('Amount')
plt.tight_layout()
plt.show()

In [None]:
# Amount distribution (histogram + KDE) for each of the two day subsets.
day_histograms = (
    (firstday_df, 'First Day Amount Distribution'),
    (second_day, 'Second Day Amount Distribution'),
)
fig, axs = plt.subplots(1, 2, figsize=(20, 6))
for panel_ax, (day_frame, panel_title) in zip(axs, day_histograms):
    sns.histplot(day_frame['Amount'], bins=50, kde=True, ax=panel_ax)
    panel_ax.set_title(panel_title)
plt.show()


In [None]:
# For every column of `data`: plot the raw values in row order, print the
# column's summary statistics once, then scatter the column against Amount.
# (The statistics used to be printed a second time after the scatter plot,
# duplicating the same table for every column.)
for i in data.columns:
    plt.plot(data[i])
    plt.title(i)
    plt.show()
    print(data[i].describe())

    sns.scatterplot(x=i, y="Amount", data=data)
    plt.title(f"Scatter plot between {i} and Amount")
    plt.show()


In [None]:
from imblearn.over_sampling import SMOTE

# Target is the `Class` column; the feature matrix is every other column.
y = data['Class']
X = data.drop(columns='Class')

# Oversample with SMOTE (fixed seed for repeatability).
# NOTE(review): this resamples the full dataset, i.e. before any train/test
# split is made from X_res / y_res.
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X, y)


In [None]:
from sklearn.model_selection import train_test_split

# Hold out the test set from the ORIGINAL (un-resampled) data first, then
# oversample the training portion only. Splitting the already-oversampled
# X_res/y_res leaks information: SMOTE builds synthetic rows by interpolating
# between real minority rows, so the test set would contain points derived
# from training rows and would also be artificially balanced.
# `stratify=y` keeps the class ratio the same in both parts of the split.
X_train_raw, X_test, y_train_raw, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)
X_train, y_train = SMOTE(random_state=42).fit_resample(X_train_raw, y_train_raw)


In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Baseline XGBoost classifier: library-default hyperparameters apart from the
# two settings below.
baseline_params = {'use_label_encoder': False, 'eval_metric': 'logloss'}
model = XGBClassifier(**baseline_params)
model.fit(X_train, y_train)

# Score the held-out split: confusion matrix first, then the per-class report.
y_pred = model.predict(X_test)
for report in (confusion_matrix, classification_report):
    print(report(y_test, y_pred))


In [None]:
from sklearn.model_selection import GridSearchCV

# Search space: 3 x 3 x 3 = 27 hyperparameter combinations.
param_grid = dict(
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.1, 0.2],
    n_estimators=[100, 200, 300],
)

# Exhaustive search with 3-fold cross-validation, ranked by accuracy;
# verbose=3 logs progress for each fit.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
    verbose=3,
)
grid_search.fit(X_train, y_train)

print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)




In [None]:
from sklearn.metrics import confusion_matrix, classification_report

# Hyperparameters are hard-coded here.
# NOTE(review): presumably copied from the grid-search output - confirm they
# still match grid_search.best_params_ after a re-run.
tuned_params = dict(
    learning_rate=0.2,
    max_depth=7,
    n_estimators=300,
    use_label_encoder=False,
    eval_metric='logloss',
)
final_model = XGBClassifier(**tuned_params)
final_model.fit(X_train, y_train)

final_predictions = final_model.predict(X_test)

# Confusion matrix, per-class report, then overall accuracy.
for metric in (confusion_matrix, classification_report, accuracy_score):
    print(metric(y_test, final_predictions))

In [None]:
from xgboost import plot_importance
# Feature-importance chart for the tuned model.
importance_ax = plot_importance(final_model)
plt.show()


In [None]:
from sklearn.model_selection import cross_val_score

from imblearn.pipeline import Pipeline

# Cross-validate on the ORIGINAL data with SMOTE as a pipeline step, so the
# oversampler is refit on the training part of every fold and never touches
# the validation part. Running CV on the pre-oversampled X_res/y_res puts
# synthetic neighbours of training rows into the validation folds, which
# inflates the scores.
model_cv = Pipeline(steps=[
    ('smote', SMOTE(random_state=42)),
    ('clf', XGBClassifier(learning_rate=0.2, max_depth=7, n_estimators=300,
                          use_label_encoder=False, eval_metric='logloss')),
])

# Validation folds now keep the real class imbalance, where plain accuracy is
# dominated by the majority class; F1 on the positive class is reported instead.
scores = cross_val_score(model_cv, X, y, cv=10, scoring='f1')

print("Cross-validation scores:", scores)
print("Average cross-validation score:", scores.mean())

In [None]:
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score, roc_auc_score

# Carve out test and validation sets from the ORIGINAL data, stratified so
# each part keeps the real class ratio. Splitting the pre-oversampled
# X_res/y_res would place SMOTE-synthesised neighbours of training rows into
# the validation and test sets (data leakage).
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.15, random_state=42, stratify=y
)
X_train_raw, X_val, y_train_raw, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.15, random_state=42, stratify=y_train_val
)

# Oversample the training portion only.
X_train, y_train = SMOTE(random_state=42).fit_resample(X_train_raw, y_train_raw)

model_final = XGBClassifier(learning_rate=0.2, max_depth=7, n_estimators=300, use_label_encoder=False, eval_metric='logloss')
model_final.fit(X_train, y_train)

# Hard predictions for accuracy; positive-class probabilities for ROC-AUC.
y_val_pred = model_final.predict(X_val)
y_val_proba = model_final.predict_proba(X_val)[:, 1]

print("Validation Accuracy:", accuracy_score(y_val, y_val_pred))
print("Validation ROC-AUC:", roc_auc_score(y_val, y_val_proba))


In [None]:
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, confusion_matrix

# Evaluate `model_final`, the model fitted on the current train/val/test
# split. The similarly named `final_model` was fitted on a different, earlier
# split whose training rows overlap this X_test, so scoring it here would
# report inflated test metrics.
y_test_pred = model_final.predict(X_test)
y_test_proba = model_final.predict_proba(X_test)[:, 1]  # positive-class probability

print("Test Accuracy:", accuracy_score(y_test, y_test_pred))
print("Test ROC-AUC:", roc_auc_score(y_test, y_test_proba))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_test_pred))
print("Classification Report:\n", classification_report(y_test, y_test_pred))


In [None]:
from sklearn.feature_selection import RFE

# Recursive feature elimination down to 10 features, removing one feature per
# iteration (step=1), using the tuned XGBoost configuration as the estimator.
selector_config = dict(estimator=final_model, n_features_to_select=10, step=1)
rfe = RFE(**selector_config).fit(X_train, y_train)

print("Feature Ranking:", rfe.ranking_)
