In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
# Ignore all warnings raised after this point.
# NOTE(review): this also hides pandas/seaborn deprecation warnings — consider narrowing the filter.
warnings.filterwarnings('ignore')

# Apply matplotlib's 'fivethirtyeight' style sheet to the figures drawn below.
plt.style.use('fivethirtyeight')
# Render figures inline in the notebook output.
%matplotlib inline

In [None]:
# Read the raw CSV with the 'latin' codec and preview the first rows.
raw_path = 'data.csv'
df = pd.read_csv(raw_path, encoding='latin')
df.head()

# 2. Exploratory data analysis

## 2.1. Quick statistical overview

In [None]:
# Summary statistics of the frame (describe() reports numeric columns by default).
df.describe()

## 2.2. Dealing with types

In [None]:
df.dtypes

In [None]:
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])
df = df.set_index('InvoiceDate')

## 2.3. Dealing with null values

In [None]:
df.isnull().sum()

In [None]:
df = df.drop(columns=['CustomerID'])

In [None]:
df[df['Description'].isnull()].head()

In [None]:
df['Description'] = df['Description'].fillna('UNKNOWN ITEM')
df.isnull().sum()

## 2.4. Checking out columns separately

In [None]:
df['Description'].value_counts().head()

In [None]:
item_counts = df['Description'].value_counts().sort_values(ascending=False).iloc[0:15]
plt.figure(figsize=(18,6))
sns.barplot(item_counts.index, item_counts.values, palette=sns.cubehelix_palette(15))
plt.ylabel("Counts")
plt.title("Which items were bought more often?");
plt.xticks(rotation=90);

In [None]:
df['Description'].value_counts().tail()

In [None]:
df[~df['Description'].str.isupper()]['Description'].value_counts().head()

In [None]:
lcase_counts = df[~df['Description'].str.isupper()]['Description'].value_counts().sort_values(ascending=False).iloc[0:15]
plt.figure(figsize=(18,6))
sns.barplot(lcase_counts.index, lcase_counts.values, palette=sns.color_palette("hls", 15))
plt.ylabel("Counts")
plt.title("Not full upper case items");
plt.xticks(rotation=90);

In [None]:
df['StockCode'].value_counts().head()

In [None]:
stock_counts = df['StockCode'].value_counts().sort_values(ascending=False).iloc[0:15]
plt.figure(figsize=(18,6))
sns.barplot(stock_counts.index, stock_counts.values, palette=sns.color_palette("GnBu_d"))
plt.ylabel("Counts")
plt.title("Which stock codes were used the most?");
plt.xticks(rotation=90);

In [None]:
df['InvoiceNo'].value_counts().tail()

In [None]:
inv_counts = df['InvoiceNo'].value_counts().sort_values(ascending=False).iloc[0:15]
plt.figure(figsize=(18,6))
sns.barplot(inv_counts.index, inv_counts.values, palette=sns.color_palette("BuGn_d"))
plt.ylabel("Counts")
plt.title("Which invoices had the most items?");
plt.xticks(rotation=90);

In [None]:
df[df['InvoiceNo'].str.startswith('C')].describe()

In [None]:
df = df[~df['InvoiceNo'].str.startswith('C')]

In [None]:
df.describe()

In [None]:
df[df['Quantity'] < 0].head()

In [None]:
df = df[df['Quantity'] > 0]
df.describe()

In [None]:
df[df['UnitPrice'] < 0].describe()

In [None]:
df[df['UnitPrice'] == -11062.06]

In [None]:
df = df[df['UnitPrice'] > 0]
df.describe()

In [None]:
# Row total: quantity times unit price.
# assign() returns a new frame, so the column is not written into a frame that
# was produced by boolean filtering (which triggers SettingWithCopyWarning).
df = df.assign(Sales=df['Quantity'] * df['UnitPrice'])
df.head()

# 3. Visual EDA

In [None]:
plt.figure(figsize=(3,6))
sns.countplot(df[df['Country'] == 'United Kingdom']['Country'])
plt.xticks(rotation=90)

In [None]:
plt.figure(figsize=(18,6))
sns.countplot(df[df['Country'] != 'United Kingdom']['Country'])
plt.xticks(rotation=90)

In [None]:
# Share of rows that belong to the United Kingdom, printed as a percentage.
is_uk = df['Country'] == 'United Kingdom'
uk_count = df.loc[is_uk, 'Country'].count()
all_count = df['Country'].count()
uk_perc = uk_count/all_count
print('{0:.2f}%'.format(uk_perc*100))

## 3.1. Detecting outliers

In [None]:
plt.figure(figsize=(18,6))
plt.scatter(x=df.index, y=df['Sales'])

In [None]:
df = df[df['Sales'] < 25000]
plt.figure(figsize=(18,6))
plt.scatter(x=df.index, y=df['Sales'])
plt.xticks(rotation=90)

In [None]:
# Selected quantiles of the numeric columns. numeric_only=True is explicit because
# pandas >= 2.0 defaults to numeric_only=False and then fails on the string columns.
df.quantile([0.05, 0.95, 0.98, 0.99, 0.999], numeric_only=True)

In [None]:
df_quantile = df[df['Sales'] < 125]
plt.scatter(x=df_quantile.index, y=df_quantile['Sales'])
plt.xticks(rotation=90)

In [None]:
df_quantile.describe()

## 3.2. Visually checking distribution of numeric features

In [None]:
plt.figure(figsize=(12,4))
sns.distplot(df_quantile[df_quantile['UnitPrice'] < 10]['UnitPrice'].values, kde=True, bins=10)

In [None]:
plt.figure(figsize=(12,4))
sns.distplot(df_quantile[df_quantile['UnitPrice'] < 5]['UnitPrice'].values, kde=True, bins=10, color='green')

In [None]:
plt.figure(figsize=(12,4))
sns.distplot(df_quantile[df_quantile['Quantity'] <= 30]['Quantity'], kde=True, bins=10, color='red')

In [None]:
plt.figure(figsize=(12,4))
sns.distplot(df_quantile[df_quantile['Quantity'] <= 15]['Quantity'], kde=True, bins=10, color='orange')

In [None]:
plt.figure(figsize=(12,4))
sns.distplot(df_quantile[df_quantile['Sales'] < 60]['Sales'], kde=True, bins=10, color='purple')

In [None]:
plt.figure(figsize=(12,4))
sns.distplot(df_quantile[df_quantile['Sales'] < 30]['Sales'], kde=True, bins=10, color='grey')

## 3.3. Analysing sales over time

In [None]:
df_ts = df[['Sales']]
df_ts.head()

In [None]:
plt.figure(figsize=(18,6))
df_resample = df_ts.resample('W').sum()
df_resample.plot()

In [None]:
df_resample['12-2010':'01-2011']

# 4. Preparing data for modeling and feature creation

In [None]:
df_clean = df[df['UnitPrice'] < 15]
df_clean.describe()

In [None]:
df_clean.index

## 4.1. Quantity per invoice feature

In [None]:
df_join = df_clean.groupby('InvoiceNo')[['Quantity']].sum()

In [None]:
df_join = df_join.reset_index()
df_join.head()

In [None]:
# Copy the index into an InvoiceDate column first: a column-on-column merge does
# not carry the left frame's index over to the result.
# assign() returns a new frame, so nothing is written into the boolean-filtered
# slice that df_clean was created from (which triggers SettingWithCopyWarning).
df_clean = df_clean.assign(InvoiceDate=df_clean.index)
# Attach the per-invoice totals; both frames have a Quantity column, so merge
# suffixes them _x (row level) and _y (invoice level).
df_clean = df_clean.merge(df_join, how='left', on='InvoiceNo')
df_clean = df_clean.rename(columns={'Quantity_x' : 'Quantity', 'Quantity_y' : 'QuantityInv'})
df_clean.tail(15)

In [None]:
df_clean.describe()

In [None]:
df_clean['InvoiceDate'] = pd.to_datetime(df_clean['InvoiceDate'])

In [None]:
df_clean.dtypes

## 4.2. Bucketizing Quantity and UnitPrice features

In [None]:
# Quantity buckets (right-closed intervals). The last bucket starts at 14 so it is
# contiguous with (11, 14]: with a start of 15, Quantity == 15 falls in no
# interval and pd.cut maps it to NaN.
# Quantities above 5000 still fall outside every bucket and become NaN.
bins_q = pd.IntervalIndex.from_tuples([(0, 2), (2, 5), (5, 8), (8, 11), (11, 14), (14, 5000)])
df_clean['QuantityRange'] = pd.cut(df_clean['Quantity'], bins=bins_q)
# Unit-price buckets (right-closed intervals) covering (0, 20].
bins_p = pd.IntervalIndex.from_tuples([(0, 1), (1, 2), (2, 3), (3, 4), (4, 20)])
df_clean['PriceRange'] = pd.cut(df_clean['UnitPrice'], bins=bins_p)
df_clean.head()

## 4.3. Extracting and bucketizing dates

In [None]:
df_clean['Month'] = df_clean['InvoiceDate'].dt.month
df_clean.head()

In [None]:
bins_d = pd.IntervalIndex.from_tuples([(0,3),(3,6),(6,9),(9,12)])
df_clean['DateRange'] = pd.cut(df_clean['Month'], bins=bins_d, labels=['q1','q2','q3','q4'])
df_clean.tail()

# 5. Building a model

## 5.1. Splitting data into UK and non-UK

In [None]:
df_uk = df_clean[df_clean['Country'] == 'United Kingdom']
df_abroad = df_clean[df_clean['Country'] != 'United Kingdom']

In [None]:
df_uk.head()

## 5.2. Extracting features and creating dummy variables

In [None]:
df_uk_model = df_uk[['Sales', 'QuantityInv', 'QuantityRange', 'PriceRange', 'DateRange']]
df_uk_model.head()

In [None]:
df_data = df_uk_model.copy()
df_data = pd.get_dummies(df_data, columns=['QuantityRange'], prefix='qr')
df_data = pd.get_dummies(df_data, columns=['PriceRange'], prefix='pr')
df_data = pd.get_dummies(df_data, columns=['DateRange'], prefix='dr')
df_data.head()

## 5.3. Scaling

In [None]:
from sklearn.preprocessing import scale
# Replace QuantityInv with its scaled version as returned by sklearn's scale().
# NOTE(review): this runs on the whole frame before the train/test split further
# down, so test rows contribute to the scaling statistics — consider fitting a
# scaler on the training split only.
df_data['QuantityInv'] = scale(df_data['QuantityInv'])

## 5.4. Train-Test Split

In [None]:
y = df_data['Sales']
X = df_data.drop(columns=['Sales'])

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Turn the values into an array for feeding the classification algorithms.
X_train = X_train.values
X_test = X_test.values
y_train = y_train.values
y_test = y_test.values

In [None]:
# Candidate estimators, each created with its default hyper-parameters.
# Estimator libraries
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import collections
classifiers = dict([
    ("LogisiticRegression", LogisticRegression()),
    ("KNearest", KNeighborsClassifier()),
    ("Support Vector Classifier", SVC()),
    ("DecisionTreeClassifier", DecisionTreeClassifier()),
])

In [None]:
from sklearn import preprocessing
from sklearn import utils

# Map every distinct value of y_train to an integer class id.
# NOTE(review): y_train comes from the Sales column (Quantity * UnitPrice), so each
# distinct amount becomes its own class — confirm that classification, rather
# than regression, is really intended here.
lab = preprocessing.LabelEncoder()
lab.fit(y_train)
y_transformed = lab.transform(y_train)

# Show the encoded labels.
print(y_transformed)

In [None]:
from sklearn.model_selection import cross_val_score


for key, classifier in classifiers.items():
    # Fit on the whole training split so the estimator kept in `classifiers` is fitted.
    classifier.fit(X_train, y_transformed)
    # Five-fold cross-validated score on the training split.
    training_score = cross_val_score(classifier, X_train, y_transformed, cv=5)
    # Convert to a percentage before rounding: round(x, 2) * 100 reintroduces
    # binary floating-point noise (e.g. 0.57 * 100 == 56.99999999999999).
    print("Classifiers: ", classifier.__class__.__name__, "Has a training score of", round(training_score.mean() * 100, 2), "% accuracy score")