In [1]:
from __future__ import division
from datetime import datetime, timedelta
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.cluster import KMeans

In [2]:
df = pd.read_csv('result.csv')

In [3]:
df.rename(columns ={'Unnamed: 0': 'new column name'}, inplace = True)

In [4]:
df.drop(["new column name"], axis = 1, inplace = True)

In [5]:
df.head()

Unnamed: 0,Date,sales,debit,credit,customer_name
0,2011-04-01,Opening Balance,5940.0,0.0,"COMMUNITY HEALTH CENTRE, PALLUR"
1,2011-04-04,Cash,0.0,5940.0,"COMMUNITY HEALTH CENTRE, PALLUR"
2,2011-04-01,Opening Balance,86484.0,0.0,"DENTSALES,KANNUR."
3,2011-04-04,Cash,0.0,19500.0,"DENTSALES,KANNUR."
4,2011-04-15,Cash,0.0,19500.0,"DENTSALES,KANNUR."


In [6]:
df['gross_total'] = df['debit'] - df['credit']

In [7]:
df['balance'] = df.groupby('customer_name')['gross_total'].cumsum()

In [8]:
df.head()

Unnamed: 0,Date,sales,debit,credit,customer_name,gross_total,balance
0,2011-04-01,Opening Balance,5940.0,0.0,"COMMUNITY HEALTH CENTRE, PALLUR",5940.0,5940.0
1,2011-04-04,Cash,0.0,5940.0,"COMMUNITY HEALTH CENTRE, PALLUR",-5940.0,0.0
2,2011-04-01,Opening Balance,86484.0,0.0,"DENTSALES,KANNUR.",86484.0,86484.0
3,2011-04-04,Cash,0.0,19500.0,"DENTSALES,KANNUR.",-19500.0,66984.0
4,2011-04-15,Cash,0.0,19500.0,"DENTSALES,KANNUR.",-19500.0,47484.0


In [9]:
df.drop(["gross_total"], axis = 1, inplace = True)

In [10]:
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')

In [11]:
df.head()

Unnamed: 0,Date,sales,debit,credit,customer_name,balance
0,2011-04-01,Opening Balance,5940.0,0.0,"COMMUNITY HEALTH CENTRE, PALLUR",5940.0
1,2011-04-04,Cash,0.0,5940.0,"COMMUNITY HEALTH CENTRE, PALLUR",0.0
2,2011-04-01,Opening Balance,86484.0,0.0,"DENTSALES,KANNUR.",86484.0
3,2011-04-04,Cash,0.0,19500.0,"DENTSALES,KANNUR.",66984.0
4,2011-04-15,Cash,0.0,19500.0,"DENTSALES,KANNUR.",47484.0


In [12]:
df['days'] = -(df.groupby('customer_name')['Date'].diff(periods=-1))

In [13]:
df.head()

Unnamed: 0,Date,sales,debit,credit,customer_name,balance,days
0,2011-04-01,Opening Balance,5940.0,0.0,"COMMUNITY HEALTH CENTRE, PALLUR",5940.0,3 days
1,2011-04-04,Cash,0.0,5940.0,"COMMUNITY HEALTH CENTRE, PALLUR",0.0,NaT
2,2011-04-01,Opening Balance,86484.0,0.0,"DENTSALES,KANNUR.",86484.0,3 days
3,2011-04-04,Cash,0.0,19500.0,"DENTSALES,KANNUR.",66984.0,11 days
4,2011-04-15,Cash,0.0,19500.0,"DENTSALES,KANNUR.",47484.0,12 days


In [14]:
df['days'] = df['days'].apply(lambda x: x.days)

In [15]:
df.head()

Unnamed: 0,Date,sales,debit,credit,customer_name,balance,days
0,2011-04-01,Opening Balance,5940.0,0.0,"COMMUNITY HEALTH CENTRE, PALLUR",5940.0,3.0
1,2011-04-04,Cash,0.0,5940.0,"COMMUNITY HEALTH CENTRE, PALLUR",0.0,
2,2011-04-01,Opening Balance,86484.0,0.0,"DENTSALES,KANNUR.",86484.0,3.0
3,2011-04-04,Cash,0.0,19500.0,"DENTSALES,KANNUR.",66984.0,11.0
4,2011-04-15,Cash,0.0,19500.0,"DENTSALES,KANNUR.",47484.0,12.0


In [16]:
df = df.fillna(0)

In [17]:
df['CEP'] = df['balance']/df['days']

In [18]:
df.replace([np.inf, -np.inf], np.nan, inplace=True) 

In [19]:
df = df.fillna(0)

In [20]:
df.head()

Unnamed: 0,Date,sales,debit,credit,customer_name,balance,days,CEP
0,2011-04-01,Opening Balance,5940.0,0.0,"COMMUNITY HEALTH CENTRE, PALLUR",5940.0,3.0,1980.0
1,2011-04-04,Cash,0.0,5940.0,"COMMUNITY HEALTH CENTRE, PALLUR",0.0,0.0,0.0
2,2011-04-01,Opening Balance,86484.0,0.0,"DENTSALES,KANNUR.",86484.0,3.0,28828.0
3,2011-04-04,Cash,0.0,19500.0,"DENTSALES,KANNUR.",66984.0,11.0,6089.454545
4,2011-04-15,Cash,0.0,19500.0,"DENTSALES,KANNUR.",47484.0,12.0,3957.0


## Clusters based on debit

In [21]:
df_debit = df.groupby('customer_name')['debit'].sum()

In [22]:
df_debit.head()

customer_name
13TH MID-TERM CONFERENCE, KANNUIR SALES      20920.0
32 CARROT(Dr)                                47609.0
38 TH ISPPD - SALES                         267425.0
38 TH ISPPD SALES-1                         248590.0
47TH KERALA STATE DENTAL CONFERENCE        2437937.0
Name: debit, dtype: float64

In [23]:
df_debit.to_csv('df_debit.csv')

In [24]:
df_debit = pd.read_csv('df_debit.csv', names=["customer_name", "debit"])

In [25]:
df_debit.head()

Unnamed: 0,customer_name,debit
0,"13TH MID-TERM CONFERENCE, KANNUIR SALES",20920.0
1,32 CARROT(Dr),47609.0
2,38 TH ISPPD - SALES,267425.0
3,38 TH ISPPD SALES-1,248590.0
4,47TH KERALA STATE DENTAL CONFERENCE,2437937.0


In [26]:
kmeans = KMeans(n_clusters=4)
kmeans.fit(df_debit[['debit']])
df_debit['debitCluster'] = kmeans.predict(df_debit[['debit']])

In [27]:
df_debit.groupby('debitCluster')['debit'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
debitCluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,6300.0,125910.7,242577.4,0.0,5747.5,28306.0,122711.1,1909736.0
1,1.0,107381600.0,,107381600.0,107381600.0,107381600.0,107381600.0,107381600.0
2,8.0,26966690.0,8999422.0,16367060.0,22199160.0,25131350.0,29828040.0,46047890.0
3,86.0,3695258.0,1886313.0,1951488.0,2319251.0,2965140.0,4345924.0,10964990.0


In [28]:
df_count = df_debit['debitCluster'].value_counts()

In [29]:
df_count.head()

0    6300
3      86
2       8
1       1
Name: debitCluster, dtype: int64

In [30]:
df_count.to_csv('debit_count.csv')

In [31]:
df_count = pd.read_csv('debit_count.csv',names=['cluster','count'])

In [32]:
df_count.head()

Unnamed: 0,cluster,count
0,0,6300
1,3,86
2,2,8
3,1,1


In [33]:
k=[]
for value in df_count['count']:
    k.append(value)


In [34]:
i=0
while(i<=4):
    if(df_count['count'][i]==max(k)):
        c = df_count['cluster'][i]
        break
    else:
        i = i+1

In [35]:
c

0

In [36]:
s = df_count['count'].sum()

In [37]:
df_clusters = df_debit[df_debit['debitCluster']!=c]

In [38]:
df_clusters.head()

Unnamed: 0,customer_name,debit,debitCluster
4,47TH KERALA STATE DENTAL CONFERENCE,2437937.0,3
5,48TH KSDC - ALOHA 16,4147800.0,3
6,49TH IDA KSDC - SALES,2872103.0,3
7,50TH IDA KSDC - SALES,2717340.14,3
8,51ST KSDC - SALES,3568090.0,3


In [39]:
df_clusters['debitCluster']=df_clusters['debitCluster'].replace({1:0,2:1,3:2})


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [40]:
df_debitc = df_debit[df_debit['debitCluster']==c]

In [None]:
df_clusters.groupby('debitCluster')['debit'].describe()

In [None]:
df_debitc.groupby('debitClusterc')['debit'].describe()

In [None]:
df_debitc.head()

In [None]:
df_clusters.head()

In [None]:
df_debitc.head()

In [41]:
kmeans = KMeans(n_clusters=4)
kmeans.fit(df_debitc[['debit']])
df_debitc['debitClusterc'] = kmeans.predict(df_debitc[['debit']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [42]:
df_debitc['debitClusterc']=df_debitc["debitClusterc"].replace({0:3,1:4,2:5,3:6})


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [43]:
df_debitc.head()

Unnamed: 0,customer_name,debit,debitCluster,debitClusterc
0,"13TH MID-TERM CONFERENCE, KANNUIR SALES",20920.0,0,4
1,32 CARROT(Dr),47609.0,0,4
2,38 TH ISPPD - SALES,267425.0,0,6
3,38 TH ISPPD SALES-1,248590.0,0,6
9,A R LAB SYSTEM ERANJIPALAM,10812.0,0,4


In [None]:
df_debitc.drop()

In [None]:
def order_cluster(cluster_field_name, target_field_name,df_,ascending):
    new_cluster_field_name = 'new_' + cluster_field_name
    df_new = df_.groupby(cluster_field_name)[target_field_name].mean().reset_index()
    df_new = df_new.sort_values(by=target_field_name,ascending=ascending).reset_index(drop=True)
    df_new['index'] = df_new.index
    df_final = pd.merge(df_,df_new[[cluster_field_name,'index']], on=cluster_field_name)
    df_final = df_final.drop([cluster_field_name],axis=1)
    df_final = df_final.rename(columns={"index":cluster_field_name})
    return df_final

In [None]:
df_debit0 = order_cluster('debitCluster0', 'debit',df_debit0,False)

In [None]:
df_d0.head()

In [None]:
from sklearn.cluster import KMeans
sse={}
df_d0 = df_debit0[['debit']]
for k in range(1, 10):
    kmeans = KMeans(n_clusters=k, max_iter=1000).fit(df_d0)
    df_d0["clusters"] = kmeans.labels_
    sse[k] = kmeans.inertia_ 
plt.figure()
plt.plot(list(sse.keys()), list(sse.values()))
plt.xlabel("Number of cluster")
plt.show()



In [None]:
from sklearn.cluster import KMeans
sse={}
df_dd0 = df_debit[['debit']]
for k in range(1, 10):
    kmeans = KMeans(n_clusters=k, max_iter=1000).fit(df_dd0)
    df_dd0["clusters"] = kmeans.labels_
    sse[k] = kmeans.inertia_ 
plt.figure()
plt.plot(list(sse.keys()), list(sse.values()))
plt.xlabel("Number of cluster")
plt.show()


In [None]:
df_debit['debit_norm']=(df_debit['debit']-df_debit['debit'].min())/(df_debit['debit'].max()-df_debit['debit'].min())

In [None]:
df_debit.head()

In [None]:
kmeans = KMeans(n_clusters=3)
kmeans.fit(df_debit[['debit_norm']])
df_debit['debitClusterNorm'] = kmeans.predict(df_debit[['debit_norm']])

In [None]:
df_debit.groupby('debitClusterNorm')['debit_norm'].describe()

In [None]:
df.shape

In [None]:
df.groupby('debitCluster')['debit'].describe()

In [None]:
df.head()

In [None]:
df['debit_norm']=(df['debit']-df['debit'].min())/(df['debit'].max()-df['debit'].min())

In [None]:
df.head()

In [None]:
kmeans = KMeans(n_clusters=3)
kmeans.fit(df[['debit_norm']])
df['debitClusterNorm'] = kmeans.predict(df[['debit_norm']])

In [None]:
df.groupby('debitClusterNorm')['debit_norm'].describe()

## CEP clustering

In [None]:
kmeans = KMeans(n_clusters=3)
kmeans.fit(df[['CEP']])
df['CEPCluster'] = kmeans.predict(df[['CEP']])

In [None]:
df.groupby('CEPCluster')['CEP'].describe()