In [None]:
import pandas as pd
import os
# Print the full path of every file found anywhere below the Kaggle input directory.
input_files = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_files:
    print(file_path)
import numpy as np

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('https://github.com/dhaitz/matplotlib-stylesheets/raw/master/pitayasmoothie-dark.mplstyle')
import plotly.express as px
import plotly.graph_objects as go
from plotly import tools
from plotly.subplots import make_subplots
from plotly.offline import iplot, init_notebook_mode
init_notebook_mode()
import warnings
warnings.filterwarnings('ignore')
from wordcloud import WordCloud, STOPWORDS
import zipfile


# Data Loading

In [None]:
from google.colab import files

# Ask for a file upload through google.colab; per the note below the expected file
# is kaggle.json (presumably the Kaggle API credentials - confirm).
uploaded = files.upload()

# Make sure the uploaded file is named kaggle.json
# Create ~/.kaggle if needed, copy the uploaded file there, and restrict its
# permissions to owner read/write (chmod 600).
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json


In [None]:
# Download the customer-shopping-dataset archive with the Kaggle CLI
# (presumably authenticated through ~/.kaggle/kaggle.json - confirm).
!kaggle datasets download -d mehmettahiraslan/customer-shopping-dataset


In [None]:
# Unpack the downloaded archive so the CSV can be read from the extraction directory.
extraction_path = '/content'
zip_file_path = '/content/customer-shopping-dataset.zip'

# Create the target directory when it is missing; an existing directory is not an error.
os.makedirs(extraction_path, exist_ok=True)

# Open the archive read-only and extract every member; the context manager closes it afterwards.
with zipfile.ZipFile(zip_file_path, mode='r') as archive:
    archive.extractall(path=extraction_path)

In [None]:
# Load the extracted CSV into the working DataFrame used by the rest of the notebook.
df=pd.read_csv('/content/customer_shopping_data.csv')

In [None]:
# Display the DataFrame to inspect the loaded data.
df

In [None]:
# List the column names available for the analysis below.
df.columns

In [None]:
# Remove the invoice number column. Reassigning the result (instead of inplace=True)
# together with errors='ignore' keeps this cell idempotent: running it again after the
# column is already gone no longer raises KeyError.
df = df.drop(columns=['invoice_no'], errors='ignore')

# The most visited shopping malls

In [None]:
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np

# Visit counts per shopping mall; computed once and reused for wedge sizes and labels.
mall_visits = df['shopping_mall'].value_counts()

# Colour scheme: the first wedge gets the base colour, the remaining wedges are
# sampled from the "spring" colormap at evenly spaced positions.
base_color = '#ff0000'  # You can change this to any color you prefer
num_colors = len(df['shopping_mall'].unique())
colors = [base_color]
for step in range(1, num_colors):
    colors.append(cm.spring(step / num_colors))

# Donut chart: a pie whose wedges are only 0.4 wide, leaving a hole in the middle.
plt.figure(figsize=(8, 8))
plt.pie(
    mall_visits,
    labels=mall_visits.index,
    autopct='%1.2f%%',
    startangle=90,
    wedgeprops=dict(width=0.4),
    colors=colors,
)
plt.title('The most visited shopping malls')
plt.axis('equal')  # Keep the aspect ratio equal so the donut is drawn as a circle.
plt.show()


# Gender distribution

In [None]:
import matplotlib.pyplot as plt

# Colour per gender, looked up by label. The colours used to be passed positionally,
# but value_counts() orders the wedges by frequency, so "blue for male" only held when
# male happened to be the most frequent value.
gender_palette = {'male': '#4169E1', 'female': '#800080'}  # Blue for male, Purple for female
fallback_color = '#808080'  # Grey for any label other than male/female

gender_totals = df['gender'].value_counts()

# Labels are matched case-insensitively because their exact casing is not fixed in this cell.
custom_colors = [
    gender_palette.get(str(label).strip().lower(), fallback_color)
    for label in gender_totals.index
]

# Plotting a pie chart for gender distribution with the label-matched colors
plt.figure(figsize=(8, 8))
gender_totals.plot(kind='pie', autopct='%1.2f%%', colors=custom_colors)
plt.title('Gender distribution')
plt.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
plt.show()


**Note:** women have been shopping a lot more than men.

# Distribution by age groups


In [None]:
def age_group_(x):
    """Map a numeric age to its age-bracket label.

    Parameters
    ----------
    x : int or float
        Age in years.

    Returns
    -------
    str or float
        '18-20yo' for ages up to and including 20, then '21-30yo', '31-40yo',
        '41-50yo', '51-60yo' (upper bounds inclusive) and '61yo+' for anything
        above 60. A missing age (NaN/None) yields NaN instead of a label.
    """
    # NaN compares False against every bound, so without this guard a missing
    # age would fall through all branches and be reported as '61yo+'.
    if pd.isna(x):
        return np.nan

    # (inclusive upper bound, label) pairs, checked in ascending order.
    brackets = (
        (20, '18-20yo'),
        (30, '21-30yo'),
        (40, '31-40yo'),
        (50, '41-50yo'),
        (60, '51-60yo'),
    )
    for upper_bound, label in brackets:
        if x <= upper_bound:
            return label
    return '61yo+'

In [None]:
# Label every row with its age bracket, then count the non-null gender values per bracket.
df['age_group']=df['age'].apply(age_group_)
df.groupby('age_group')['gender'].agg(['count'])

In [None]:
import matplotlib.pyplot as plt

# Number of rows for each individual age, ordered by age rather than by frequency.
age_counts = df['age'].value_counts().sort_index()

# Draw the bars on an explicitly created axes.
fig, ax = plt.subplots(figsize=(10, 6))
age_counts.plot(kind='bar', color='skyblue', ax=ax)
ax.set_title('Age Distribution')
ax.set_xlabel('Age')
ax.set_ylabel('Count')
plt.xticks(rotation=45)
ax.grid(axis='y', linestyle='--', alpha=0.7)  # horizontal guide lines only
fig.tight_layout()
plt.show()


In [None]:
import matplotlib.pyplot as plt

# Bucket the raw ages into labelled ranges; each bin edge is the inclusive upper
# bound of its bucket (17 < age <= 20, 20 < age <= 30, ...).
age_bins = [17, 20, 30, 40, 50, 60, 100]
age_labels = ['18-20yo', '21-30yo', '31-40yo', '41-50yo', '51-60yo', '61yo+']
age_groups = pd.cut(df['age'], bins=age_bins, labels=age_labels)

# Rows per bucket, kept in bucket order rather than sorted by size.
bucket_sizes = age_groups.value_counts().sort_index()

fig, ax = plt.subplots(figsize=(10, 6))
bucket_sizes.plot(kind='bar', color='skyblue', ax=ax)
ax.set_title('Age Distribution For Better View')
ax.set_xlabel('Age Group')
ax.set_ylabel('Count')
plt.xticks(rotation=45)
ax.grid(axis='y', linestyle='--', alpha=0.7)  # horizontal guide lines only
fig.tight_layout()
plt.show()


In [None]:
import matplotlib.pyplot as plt

# Define age groups
age_bins = [17, 20, 30, 40, 50, 60, 100]
age_labels = ['18-20yo', '21-30yo', '31-40yo', '41-50yo', '51-60yo', '61yo+']

# Group ages into bins
age_groups = pd.cut(df['age'], bins=age_bins, labels=age_labels)

# Count 'male' and 'female' within each age group (one column per gender value)
gender_counts = df.groupby([age_groups, 'gender']).size().unstack().fillna(0)

# Pick each column's colour by its label instead of by position, so the colour of a
# gender does not depend on column order. Matching is case-insensitive because the
# exact casing of the labels is not fixed in this cell.
gender_palette = {'male': '#4169E1', 'female': '#800080'}  # Blue for male, Purple for female
bar_colors = [
    gender_palette.get(str(column).strip().lower(), '#808080')  # grey for other labels
    for column in gender_counts.columns
]

# DataFrame.plot opens its own figure unless it is handed an axes, so a bare
# plt.figure(figsize=...) beforehand only produced an empty figure and the requested
# size was never applied. Create the axes explicitly and pass it in.
fig, ax = plt.subplots(figsize=(10, 6))
gender_counts.plot(kind='bar', stacked=True, color=bar_colors, ax=ax)
ax.set_title('Age Distribution by Gender')
ax.set_xlabel('Age Group')
ax.set_ylabel('Count')
plt.xticks(rotation=45)
ax.grid(axis='y', linestyle='--', alpha=0.7)
ax.legend(title='Gender')
fig.tight_layout()
plt.show()


# Distribution of purchase categories relative to gender

In [None]:
# Purchase counts per gender (rows) and category (columns); 'age_group' is only the column being counted.
df.pivot_table(index='gender',columns='category',values='age_group', aggfunc=(['count']))

In [None]:
# Interactive histogram of gender coloured by category, with a marginal box plot;
# every DataFrame column is passed as hover data.
fig = px.histogram(df, x='gender',color='category',marginal='box',hover_data=df.columns,title= 'Distribution of purchase categories relative to gender', width=1200, height=600,template='plotly_dark')
fig.show()

# Distribution of purchase categories relative to quantity

In [None]:
# Number of rows per purchased quantity, largest count first.
df.groupby('quantity')['category'].agg(['count']).sort_values(by='count',ascending=False)

In [None]:
# Row counts per quantity (rows) and category (columns); 'age_group' is only the column being counted.
df.pivot_table(index='quantity',columns='category',values='age_group', aggfunc=(['count']))

In [None]:
# Count purchases for every (quantity, category) pair. 'age_group' only serves as the
# column being counted, so age plays no part in this chart.
quantity_by_category = df.pivot_table(index='quantity', columns='category', values='age_group', aggfunc=(['count']))

# Grouped bars: one group per quantity, one bar per category. The title names the
# plotted index (quantity); it used to say "category and age", which did not match the data.
quantity_by_category.plot(kind='bar', figsize=(10,10), title='Distribution of purchase categories relative to quantity')
plt.show()

# Distribution relative to the payment method

In [None]:
# Number of rows (non-null age_group values) per payment method.
df.groupby('payment_method')['age_group'].agg(['count'])

In [None]:
import matplotlib.pyplot as plt

# Group by payment method and count age groups
payment_counts = df.groupby('payment_method')['age_group'].count()

# One distinct colour per payment method. A fixed two-colour list gets recycled as soon
# as the data holds more than two methods, which makes different wedges share a colour
# and look like duplicates. 'tab10' stays distinct for up to ten methods.
custom_colors = sns.color_palette('tab10', n_colors=len(payment_counts))

# Plotting a pie chart for payment method distribution with one colour per method
plt.figure(figsize=(8, 8))
payment_counts.plot(kind='pie', autopct='%1.1f%%', colors=custom_colors)
plt.title('Payment Method Distribution')
plt.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
plt.show()


I'm not sure why the chart does not combine the two debit slices into one,
**but together they account for 64.9% of the total.**

In [None]:
# Scatter plot of price (x) against payment method (y), coloured by age group.
plt.figure(figsize=(18,12))
sns.scatterplot(data=df, x='price', y='payment_method', hue='age_group', legend=True)
plt.xticks(rotation=90)
plt.title('Distribution relative to the payment method',fontsize = 20)
plt.show()

# Distribution Age with Categories

In [None]:
# Row counts per age group (rows) and category (columns); 'payment_method' is only the column being counted.
df.pivot_table(index='age_group',columns='category',values='payment_method', aggfunc=(['count']))

In [None]:
# Distribution plot of age_group split by category.
# NOTE(review): bins=70 is passed although age_group holds string labels - confirm it has any effect.
sns.displot(df, x='age_group',bins=70, hue='category',aspect=2.9).set(title='Distribution of product categories relative to age groups')
plt.xticks(rotation=90)
plt.show()

# Distribution of average price

In [None]:
# Mean and total price per category, highest mean first.
df.groupby('category')['price'].agg(['mean','sum']).sort_values(by='mean',ascending=False)

In [None]:
# Box plot of price for each category on a 20x15 figure.
plt.subplots(figsize=(20,15))
plt.xticks(rotation=90)
sns.boxplot(x='category', y='price', data=df)
plt.show()

In [None]:
# Histogram over age with price as the y value, coloured by shopping mall, with a marginal box plot.
fig = px.histogram(df, x='age', y='price', color='shopping_mall',barmode='relative',marginal='box', title='Distribution of the price of goods in shopping centers and the costs of buyers', width=1200, height=600 ,template='plotly_dark')
fig.show()

In [None]:
# Cast price to integer, overwriting the original column.
# NOTE(review): if price holds fractional values, astype(int) discards the fractional part - confirm that is acceptable.
df['price'] = df['price'].astype(int)

In [None]:
# Display the DataFrame after the cast.
df

In [None]:
from sklearn import svm, preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.metrics import r2_score
from matplotlib import style


In [None]:
# Work on a copy so the encoding steps below do not modify df.
df2=df.copy()

In [None]:
# Replace each of these columns with the integer codes produced by pd.factorize,
# so that the model inputs built from df2 are numeric.
encoded_columns = [
    'customer_id',
    'gender',
    'category',
    'payment_method',
    'shopping_mall',
    'age_group',
    'invoice_date',
]
for column in encoded_columns:
    df2[column] = pd.factorize(df2[column])[0]

In [None]:
# Make sure price is numeric; values that cannot be parsed become NaN (errors='coerce').
df2['price'] = pd.to_numeric(df2['price'], errors='coerce')

In [None]:
# Features: every column except price. Target: price.
x = df2.drop('price',axis=1)
y = df2['price']

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; random_state fixes the split.
x_train,x_test,y_train,y_test = train_test_split(x,y,random_state=43, test_size=0.2)

In [None]:
from sklearn.ensemble import RandomForestClassifier
# NOTE(review): a classifier treats every distinct price as its own class, and the
# accuracy computed below only counts exact matches - confirm this is intended rather
# than a regressor scored with a regression metric.
rfc = RandomForestClassifier(random_state=42,min_samples_leaf=100)

In [None]:
# Fit the model on the training split.
rfc.fit(x_train,y_train)

In [None]:
# Predict the price for the held-out test rows.
y_pred = rfc.predict(x_test)

In [None]:
from sklearn.metrics import accuracy_score
# Share of test rows whose predicted price equals the actual price.
accuracy = accuracy_score(y_test,y_pred)

In [None]:
print('Accuracy:',accuracy)

In [None]:
# Store the model's prediction for every row of x (training and test rows alike) next to the actual price.
df2['Prediction']=rfc.predict(x)

In [None]:
# Show the first ten rows, including the new Prediction column.
df2.head(10)

In [None]:
# Line chart of actual and predicted price against invoice_date.
# NOTE(review): invoice_date holds pd.factorize codes at this point, not dates, and the
# rows are not sorted by it - confirm the x axis is meaningful.
fig=px.line(df2,x='invoice_date',y=['price','Prediction'],template='ggplot2')
fig.show()

In [None]:
import plotly.graph_objects as go
import plotly.express as px

# Assuming df2 is your DataFrame containing 'invoice_date', 'price', and 'Prediction' columns

# Overlay the actual and the predicted price as two line traces on a shared invoice_date axis.
fig = go.Figure()

# (column to plot, legend entry) for each trace, in drawing order.
trace_specs = [
    ('price', 'Actual Price'),
    ('Prediction', 'Predicted Price'),
]
for column, trace_name in trace_specs:
    fig.add_trace(
        go.Scatter(x=df2['invoice_date'], y=df2[column], mode='lines', name=trace_name)
    )

# Titles and theme; any other plotly template name can be used instead.
layout_options = dict(
    title='Actual vs. Predicted Prices Over Time',
    xaxis_title='Invoice Date',
    yaxis_title='Price',
    template='plotly_dark',
)
fig.update_layout(**layout_options)

fig.show()
