In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## About Dataset


### Context

Price optimization uses historical data to identify the price of a product or service that maximizes the company's profitability. Numerous factors, such as demography, operating costs, and survey data, play a role in efficient pricing; it also depends on the nature of the business and the product being served. The business regularly adds or upgrades features to bring more value to the product, and this obviously has a cost in terms of effort, time, and, most importantly, the company's reputation.

As a result, it is important to get the pricing right: price a little too high and you lose customers, while even slight underpricing results in lost revenue. Price optimization helps businesses strike the right balance between efficient pricing, achieving profit objectives, and serving their customers.

### Content

* The data contains the demand and corresponding average unit price at a product - month_year level

* Tasks
* Exploratory data analysis
* Data visualization
* Demand forecasting
* Price optimization


## Veri Kümesi Hakkında

### Bağlam

Fiyat optimizasyonu, şirketin karlılığını en üst düzeye çıkaran bir ürün veya hizmetin en uygun fiyatını belirlemek için geçmiş verileri kullanmaktır. Verimli fiyatlandırmada rol oynayan demografi, işletme maliyetleri, anket verileri vb. gibi çok sayıda faktör vardır ve bu aynı zamanda işletmelerin doğasına ve hizmet verilen ürüne de bağlıdır. İşletme, ürüne daha fazla değer katmak için düzenli olarak özellikler ekler/yükseltir ve bunun açıkça çaba, zaman ve en önemlisi şirketin itibarı açısından bir maliyeti vardır.

Sonuç olarak, doğru fiyatlandırmayı anlamak önemlidir, biraz fazla yüksek olursa müşterilerinizi kaybedersiniz ve biraz düşük fiyatlandırma gelir kaybına neden olur. Fiyat optimizasyonu, işletmelerin verimli fiyatlandırma, kâr hedeflerine ulaşma ve ayrıca müşterilerine hizmet etme arasında doğru dengeyi kurmasına yardımcı olur.

### İçerik

* Veriler, bir ürün - ay_yıl düzeyinde talebi ve buna karşılık gelen ortalama birim fiyatı içerir

* Görevler
* Keşifsel veri analizi
* Veri görselleştirme
* Talep tahmini
* Fiyat optimizasyonu

## Analysis Content 

* 1.[Python Libraries](#1)
* 2.[Data Loading](#2)
* 3.[EDA](#3)
* 4.[Data Preprocessing](#4)
* 5.[Modelling](#5) 

<a id="1"></a>
## 1.Python Libraries

In [2]:
#Let's load the relevant libraries (İlgili kütüphaneleri yükleyelim);

import numpy as np 
import pandas as pd
from sklearn.model_selection import train_test_split,GridSearchCV,RandomizedSearchCV
from sklearn.metrics import mean_squared_error,r2_score
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.preprocessing import scale
from sklearn.preprocessing import StandardScaler
from sklearn import model_selection
from sklearn.linear_model import Ridge,Lasso,RidgeCV,LassoCV,ElasticNet,ElasticNetCV,LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn import neighbors
from sklearn.svm import SVR
import warnings
warnings.filterwarnings("ignore")

<a id="2"></a>
## 2.Data Loading

In [3]:
df=pd.read_csv("/kaggle/input/retail-price-optimization/retail_price.csv")
df.head()

FileNotFoundError: [Errno 2] No such file or directory: '/kaggle/input/retail-price-optimization/retail_price.csv'

In [None]:
df.tail()

<a id="3"></a>
## 3.EDA

In [None]:
df.shape

In [None]:
# Our dataset consists of 676 observations and 30 attributes
# Verisetimiz 676 gözlem ve 30 öznitelikten oluşmaktadır

In [None]:
#columns

df.columns

In [None]:
df.dtypes

In [None]:
# structural information

df.info()

* Our dataset consists of 27 numeric, 3 categorical variables, no missing values


*  Verisetimiz 27 sayısal ,3 kategorik değişkenden oluşuyor,eksik değer yok



In [None]:
#unique value counts

df.nunique()

In [None]:
df.drop("product_id",axis=1,inplace=True)

In [None]:
# describe

df.describe([0.05,0.1,0.25,0.35,0.5,0.65,0.75,0.9,0.95]).T

In [None]:
df.sort_values("total_price",ascending=False).head(15) #total price highest

In [None]:
df.sort_values("total_price",ascending=True).head(15) #total price lowest

<a id="4"></a>
## 4.Data Preprocessing

In [None]:
df.isna().sum()

In [None]:
# let's look at the distributions of the numeric variables;
# sayısal değişkenlerin dağılım durumlarına bakalım;

In [None]:
# Plot the distribution of every numeric column (integers AND floats) on one grid.
# Selecting only "int" columns silently skips the float features, although the
# stated goal is to inspect all numeric variables.
numeric_cols = df.select_dtypes("number").columns
n_cols = 3
n_rows = max(1, -(-len(numeric_cols) // n_cols))  # ceiling division

plt.figure(figsize=(12,12))
for k, col in enumerate(numeric_cols, start=1):
    plt.subplot(n_rows, n_cols, k)
    # sns.distplot is deprecated; histplot(kde=True, stat="density") is the
    # documented equivalent (density-normalised histogram plus KDE curve).
    sns.histplot(df[col], kde=True, stat="density")
    plt.title(col)

# Lay the grid out once, after all panels exist, instead of on every iteration.
plt.tight_layout()



*  Let's try 2 different approaches, 1st let's not touch the outliers, 2nd remove the outliers and make a copy, then compare

*  2 farklı yaklaşım deneyelim,1.si aykırı değerlere dokunmayalım,2.si aykırı değerleri kaldırıp bir kopyasını alalım,daha sonra karşılaştıralım

In [None]:
df.describe([0.05,0.1,0.25,0.35,0.5,0.65,0.75,0.9,0.95]).T

In [None]:
df_outlier_remove=df.copy()

In [None]:
outlier_list=["qty","customers","comp_1","comp_3"]

# Report, per column, whether any value lies outside fences built from the
# 10th/90th percentiles (wider than the classic 25th/75th-quartile IQR rule).
for col in outlier_list:
    values = df_outlier_remove[col]
    p10 = values.quantile(0.10)
    p90 = values.quantile(0.90)
    spread = p90 - p10
    up = p90 + 1.5*spread
    low = p10 - 1.5*spread

    # Test the column itself. Slicing whole rows and calling .any(axis=None)
    # would make the answer depend on the truthiness of unrelated columns
    # rather than on whether an out-of-fence value exists.
    has_outliers = ((values > up) | (values < low)).any()
    print(col, "yes" if has_outliers else "no")

In [None]:
# We detected outliers in columns qty and comp_1

# qty ve comp_1 kolonlarında aykırı değerler tespit ettik

In [None]:
#reach outliers;

def outliers_train(df_outlier_remove, lower_pct=10, upper_pct=90, k=1.5):
    """Return the values of a 1-D collection that fall outside percentile fences.

    The fences are ``P_lower - k * spread`` and ``P_upper + k * spread`` where
    ``spread = P_upper - P_lower``. With the defaults (10th/90th percentiles,
    k=1.5) the result is identical to the previous hard-coded behaviour.

    Parameters
    ----------
    df_outlier_remove : array-like
        One-dimensional numeric data, e.g. a single DataFrame column. The
        parameter name is kept for backward compatibility.
    lower_pct, upper_pct : float
        Percentiles (0-100) that anchor the fences.
    k : float
        Multiplier applied to the percentile spread.

    Returns
    -------
    list
        Values strictly below the lower fence or strictly above the upper
        fence, in input order.
    """
    p_low, p_high = np.percentile(df_outlier_remove, [lower_pct, upper_pct])
    spread = p_high - p_low
    low, high = p_low - k * spread, p_high + k * spread
    # Use a distinct local name: the original list shadowed the function itself.
    return [value for value in df_outlier_remove if value < low or value > high]


In [None]:
outliers_train(df.qty)

In [None]:
# There are 9 outliers (for qty)
# 9 tane aykırı değer var (qty için)

In [None]:
outliers_train(df.comp_1)

In [None]:
# 3 tane aykırı değerimiz var(comp_1)
# We have 3 outliers (comp_1)

In [None]:
# let's eliminate outliers
# aykırı değerleri ortadan kaldıralım

In [None]:
# Cap (winsorize) outliers: values beyond the fences are pulled back to the
# fence value; no rows are dropped.
for col in outlier_list:
    p10 = df_outlier_remove[col].quantile(0.10)
    p90 = df_outlier_remove[col].quantile(0.90)
    spread = p90 - p10
    up_lim = p90 + 1.5*spread
    low_lim = p10 - 1.5*spread

    # Series.clip replaces the two masked .loc assignments. Writing a float
    # limit into an integer column through .loc is a deprecated
    # incompatible-dtype assignment in recent pandas; clip upcasts as needed.
    df_outlier_remove[col] = df_outlier_remove[col].clip(lower=low_lim, upper=up_lim)

In [None]:
# Re-run the fence check on the same columns to see whether any value still
# lies outside the 10th/90th-percentile fences.
for col in outlier_list:
    values = df_outlier_remove[col]
    p10 = values.quantile(0.10)
    p90 = values.quantile(0.90)
    spread = p90 - p10
    up = p90 + 1.5*spread
    low = p10 - 1.5*spread

    # Test the column itself. Slicing whole rows and calling .any(axis=None)
    # would make the answer depend on the truthiness of unrelated columns
    # rather than on whether an out-of-fence value exists.
    has_outliers = ((values > up) | (values < low)).any()
    print(col, "yes" if has_outliers else "no")

In [None]:
# aykırı değerleri ortadan kaldırdık
# we eliminated outliers

### Time Series Analysis

In [None]:
from datetime import datetime

In [None]:
df["month_year"] = pd.to_datetime(df.month_year)

In [None]:
# Extract the day-of-month component of the parsed month_year timestamp.
# NOTE(review): the dataset is described as product / month_year level, so this
# value may carry little information, or may reflect how pd.to_datetime guessed
# the day-vs-month order — confirm the raw month_year format.
df["day"]=df["month_year"].dt.day

In [None]:
df.head()

In [None]:
# Mean of every numeric column per month_year. numeric_only=True keeps the
# remaining text columns out of the aggregation; without it, .mean() raises
# TypeError on pandas >= 2.0 when object columns are present.
custgroup=df.groupby('month_year').mean(numeric_only=True)
plt.figure(figsize=(12,5))
# Series.plot ignores an `x` argument; the group key (the index) is the x axis.
custgroup['lag_price'].plot()
plt.title("lag_price status")
plt.show()

In [None]:
# Mean of every numeric column per extracted "day" value. numeric_only=True
# avoids the TypeError that .mean() raises on pandas >= 2.0 when non-numeric
# columns are present.
custgroup=df.groupby('day').mean(numeric_only=True)
fig,ax=plt.subplots(figsize=(12,5))
ax.xaxis.set(ticks=range(0,31))
# Draw on the axes created above explicitly instead of relying on the implicit
# "current axes"; Series.plot ignores an `x` argument, so it is dropped.
custgroup['lag_price'].plot(ax=ax)
plt.title("lag_price status by day")
plt.show()


In [None]:
# month_year kolonunu atalım;

df.drop("month_year",axis=1,inplace=True)
df.head(2)

In [None]:
# let's apply the same updates to df_outlier_remove
# aynı güncellemeleri df_outlier_remove içinde yapalım

In [None]:
df_outlier_remove["month_year"] = pd.to_datetime(df_outlier_remove.month_year)
df_outlier_remove["day"]=df_outlier_remove["month_year"].dt.day
df_outlier_remove.drop("month_year",axis=1,inplace=True)
df_outlier_remove.head(2)


### Data Visualization

In [None]:
# Number of rows per product category.
plt.figure(figsize=(8,8))
# Pass the column by keyword: in seaborn >= 0.12 the first positional argument
# is `data`, so a positional Series is no longer interpreted as the x variable.
sns.countplot(x=df["product_category_name"]);

In [None]:
sns.pairplot(df.select_dtypes("int"));

In [None]:
df.hist(figsize=(9,9));

### Encoding

In [None]:
## One Hot Encoding İşlemi;

dms=pd.get_dummies(df["product_category_name"])
dms


In [None]:
df.drop("product_category_name",axis=1,inplace=True)

In [None]:
df=pd.concat([df,dms],axis=1)

In [None]:
df.head()

In [None]:
# apply the same one-hot encoding to df_outlier_remove;

dms2=pd.get_dummies(df_outlier_remove["product_category_name"])
df_outlier_remove.drop("product_category_name",axis=1,inplace=True)
df_outlier_remove=pd.concat([df_outlier_remove,dms2],axis=1)

### Correlation Analysis

In [None]:
plt.figure(figsize=(12,12))
sns.heatmap(df.corr(),annot=True,linewidths=0.7,fmt=".2f",cmap="coolwarm")
plt.show()


In [None]:
cor=df.corr()["lag_price"].sort_values(ascending=False)
pd.DataFrame({"column":cor.index,"Correlation with lag_price":cor.values})


In [None]:
df.drop(["unit_price","day"],axis=1,inplace=True)

In [None]:
plt.figure(figsize=(12,12))
sns.heatmap(df.corr(),annot=True,linewidths=0.7,fmt=".2f",cmap="coolwarm")
plt.show()


In [None]:
# drop the same columns from df_outlier_remove;

df_outlier_remove.drop(["unit_price","day"],axis=1,inplace=True)

In [None]:
#df3;


<a id="5"></a>
## 5.Modelling

In [None]:
X=df.drop("lag_price",axis=1)
y=df["lag_price"] 


In [None]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=0)


In [None]:
!pip install catboost


In [None]:
!pip install lightgbm


In [None]:
!pip install xgboost

In [None]:
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor

In [None]:
# Fit one baseline regressor per model family with default hyper-parameters.
# Estimators that use randomness get a fixed seed so that a fresh
# "Restart & Run All" reproduces the same fitted models and scores.
SEED = 0  # same value the train/test split uses for its random_state

# Deterministic estimators (no seed needed with their default settings).
ridge=Ridge().fit(X_train,y_train)
lasso=Lasso().fit(X_train,y_train)
enet=ElasticNet().fit(X_train,y_train)
knn=KNeighborsRegressor().fit(X_train,y_train)
svm=SVR().fit(X_train,y_train)

# Stochastic estimators.
ada=AdaBoostRegressor(random_state=SEED).fit(X_train,y_train)
mlpc=MLPRegressor(random_state=SEED).fit(X_train,y_train)
dtc=DecisionTreeRegressor(random_state=SEED).fit(X_train,y_train)
rf=RandomForestRegressor(random_state=SEED).fit(X_train,y_train)
xgb=XGBRegressor(random_state=SEED).fit(X_train,y_train)
gbm=GradientBoostingRegressor(random_state=SEED).fit(X_train,y_train)
lgb=LGBMRegressor(random_state=SEED).fit(X_train,y_train)
# verbose=0 suppresses CatBoost's per-iteration training log in the cell output.
catbost=CatBoostRegressor(random_state=SEED,verbose=0).fit(X_train,y_train)

In [None]:
models=[ridge,lasso,enet,knn,ada,svm,mlpc,dtc,rf,xgb,gbm,lgb,catbost]

def ML(y,models):
    """Return the score of a fitted estimator on the held-out test split.

    The score is computed on ``X_test``/``y_test`` rather than on the training
    data, so it reflects generalisation instead of how well the model
    memorised the rows it was fitted on.

    Parameters
    ----------
    y : object
        Unused. Kept only so that existing calls ``ML(<label>, model)`` keep
        working.
    models : fitted estimator
        A single fitted model exposing ``score(X, y)``. For scikit-learn-style
        regressors this is the coefficient of determination (R^2).

    Returns
    -------
    float
        ``models.score(X_test, y_test)``.
    """
    return models.score(X_test,y_test)


In [None]:
# Print every fitted model next to the score returned by ML().
for fitted_model in models:
    model_score = ML("survived", fitted_model)
    print(fitted_model, "Algorithm succed rate :", model_score)



* # çok yüksek başarı oranları verdi,optimizasyon işlemi yapmamıza gerek kalmadı.

* # gave very high success rates, no need for optimization