In [1]:
import pandas as pd
import pyodbc
import numpy as np

In [2]:
import sqlCredentials as sql

In [3]:
# Open the ODBC connection to the procurement database. The login values are
# read from the sqlCredentials module instead of being typed into this cell.
connection_parts = [
    'DRIVER={ODBC Driver 17 for SQL Server}',
    'Server=52.86.56.66',
    'Database=PROCUREMENTDB',
    'UID=' + sql.username,
    'PWD=' + sql.password,
    'Trusted_connection=no',
]
# Every key/value pair, including the last one, is terminated by ';'.
proc_db = pyodbc.connect(';'.join(connection_parts) + ';')

In [4]:
# Query text: fetch every row and column of the publisher dictionary table.
pub_dict = """
SELECT *
FROM dbo.PublisherDictionary
"""

In [5]:
# Run the dictionary query over the open connection and load the result into a DataFrame.
df = pd.read_sql(pub_dict,proc_db)

In [6]:
# Preview the first rows of the publisher dictionary.
df.head()

Unnamed: 0,PublisherLong,PublisherShort
0,"""Abrams, Inc.""",ABRAMS
1,"""Addison-Wesley Longman, Incorporated""",PEARSON
2,"""Addison-Wesley Longman, Limited""",PEARSON
3,"""Allyn & Bacon, Incorporated""",PEARSON
4,"""American Occupational Therapy Association, In...","American Occupational Therapy Association, Inc..."


In [7]:
# Show the column dtypes of the publisher dictionary.
df.dtypes

PublisherLong     object
PublisherShort    object
dtype: object

In [8]:
# Query text: fetch every row and column of the supplier mega list.
# NOTE: the name `megalist` is rebound to the query result in the next cell,
# so this SQL string is no longer available after that cell has run.
megalist = """
SELECT *
FROM Process.SupplierMegaList
"""

In [None]:
# `megalist` starts out as the SQL text and is rebound here to the query
# result. The query only runs while the name still holds the SQL string, so
# re-running this cell no longer hands a DataFrame to read_sql as if it were
# a query. To force a fresh load, re-run the cell that defines the SQL text first.
if isinstance(megalist, str):
    megalist = pd.read_sql(megalist, proc_db)

In [None]:
# Preview the first rows of the supplier mega list.
megalist.head()

In [None]:
# Show the column dtypes of the supplier mega list.
megalist.dtypes

In [None]:
# List the available column names before selecting the ones needed below.
megalist.columns

In [None]:
# Keep only the columns needed for the cost calculation. The explicit .copy()
# makes sup_list an independent frame, so the column assignments and drops in
# the following cells modify it directly instead of operating on a slice of
# megalist (which raises SettingWithCopyWarning).
supplier_columns = [
    'ISBN', 'Supplier', 'Publisher', 'Currency', 'ListPrice',
    'Discount', 'CostUnitPrice', 'UnitShippingCost', 'MaxQtyPerOrder',
]
sup_list = megalist[supplier_columns].copy()

In [None]:
# Preview the selected supplier columns.
sup_list.head()

In [None]:
# Effective price: take CostUnitPrice wherever ListPrice is zero, otherwise ListPrice.
list_price_is_zero = sup_list['ListPrice'].eq(0.00)
sup_list['price'] = np.where(
    list_price_is_zero,
    sup_list['CostUnitPrice'],
    sup_list['ListPrice'],
)

In [None]:
# Preview the rows with the new price column.
sup_list.head()

In [None]:
# The two source price columns are no longer needed once `price` exists.
sup_list = sup_list.drop(columns=['ListPrice', 'CostUnitPrice'])

In [None]:
# Display the supplier table after the price columns were consolidated.
sup_list

In [None]:
# List the distinct supplier names as loaded, before any text clean-up.
sup_list['Supplier'].unique()

In [None]:
def clean_up(phrase):
    """Normalise a text value: trim surrounding whitespace and upper-case it.

    Parameters
    ----------
    phrase : str or any
        The value to normalise. Non-text values (for example the None/NaN
        that a SQL NULL turns into) are returned unchanged instead of
        raising AttributeError.

    Returns
    -------
    The stripped, upper-cased text, or `phrase` itself when it is not text.
    """
    # Missing or non-text cells have no strip()/upper(); pass them through so
    # a single NULL does not abort the clean-up of a whole column.
    if not isinstance(phrase, (str, bytes)):
        return phrase
    return phrase.strip().upper()

In [None]:
# Text columns that must be trimmed and upper-cased before they are compared or joined.
str_columns = ['ISBN', 'Supplier', 'Publisher', 'Currency']

# Map clean_up over each column's values. Working on the single column avoids
# building a full row object for every record, as a row-wise
# DataFrame.apply(axis=1) would do once per column.
for col in str_columns:
    sup_list[col] = sup_list[col].map(clean_up)

sup_list['Supplier'].unique()

In [None]:
# Print the distinct values of each text column.
for col in str_columns:
    print(sup_list[col].unique())

In [None]:
# There are too many publisher name variants, so they are mapped through the
# publisher dictionary; a left join does the work.
df.head()

In [None]:
# Apply the same text normalisation to every dictionary column so that
# PublisherLong can be matched against the cleaned supplier Publisher column.
# Mapping over one column at a time avoids a row-wise pass over the whole
# frame for each column.
for col in list(df.columns):
    df[col] = df[col].map(clean_up)

In [None]:
# List the distinct short publisher names in the cleaned dictionary.
df['PublisherShort'].unique()

In [None]:
# Merge the supplier table with the publisher dictionary. The left join keeps
# every supplier row and adds the short publisher name where one is known.
# The dictionary is de-duplicated on its join key first: if the same
# PublisherLong appeared more than once (for example after the
# trim/upper-case clean-up), each matching supplier row would be emitted
# once per duplicate.
pub_lookup = df.drop_duplicates(subset='PublisherLong')
sup_list2 = pd.merge(
    sup_list,
    pub_lookup,
    how='left',
    left_on='Publisher',
    right_on='PublisherLong',
)

In [None]:
# Display the merged supplier / publisher dictionary table.
sup_list2

In [None]:
# PublisherLong was only needed as the join key; remove it from the merged table.
sup_list2 = sup_list2.drop(columns=['PublisherLong'])

In [None]:
# Compare the number of distinct entries in each publisher column
# (missing values are included in the count).
short_name_count = sup_list2['PublisherShort'].nunique(dropna=False)
raw_name_count = sup_list2['Publisher'].nunique(dropna=False)
print('# of pubs in PublishertShort Column: ', short_name_count)
print('# of pubs in Publisher Column: ', raw_name_count)

In [None]:
# Preview the merged table after the join key column was removed.
sup_list2.head()

In [None]:
# Show the column dtypes of the merged table.
sup_list2.dtypes

In [None]:
# Count the missing values in the short publisher name (rows with no
# dictionary match) and in the original publisher column.
print('PublisherShort Empty Cells: ',sup_list2['PublisherShort'].isna().sum())
print('Publisher Empty Cells:', sup_list2['Publisher'].isna().sum())

In [None]:
# Fill the gaps in PublisherShort: keep the short name where the dictionary
# provided one, otherwise fall back to the original Publisher value.
short_name = sup_list2['PublisherShort']
sup_list2['Pub_noNA'] = np.where(
    short_name.notna(),
    short_name,
    sup_list2['Publisher'],
)

In [None]:
# Count the rows that still have no publisher name after the fallback.
sup_list2['Pub_noNA'].isna().sum()

In [None]:
# List the distinct publisher names after the fallback.
sup_list2['Pub_noNA'].unique()

In [None]:
# Supplier-specific extra cost: an invoice fee chosen by supplier name.
# The first matching rule wins and suppliers matching no rule get a fee of 0.
# na=False makes a missing supplier name count as "no match"; without it
# str.contains returns NaN for that row, which np.where treats as true and
# would charge the first fee. regex=False matches the names as plain substrings.
supplier_name = sup_list2['Supplier']
is_alek = supplier_name.str.contains('ALEK', na=False, regex=False)
is_laurentiu_or_senad = (
    supplier_name.str.contains('LAURENTIU', na=False, regex=False)
    | supplier_name.str.contains('SENAD', na=False, regex=False)
)
sup_list2['Invoice_Fee'] = np.select(
    [is_alek, is_laurentiu_or_senad],
    [0.005, 0.02],
    default=0,
)

In [None]:
# Preview the rows with the new Invoice_Fee column.
sup_list2.head()

In [None]:
# Extra 0.03 fee for BILLSON rows that carry no discount; 0 for every other row.
is_billson = sup_list2['Supplier'].str.contains('BILLSON')
has_zero_discount = sup_list2['Discount'].eq(0)
sup_list2['lp_Fee'] = np.where(is_billson & has_zero_discount, 0.03, 0)

In [None]:
# Preview the rows with the new lp_Fee column.
sup_list2.head()

In [None]:
# Lower-case every column header.
sup_list2.columns = [str.lower(header) for header in sup_list2.columns]

In [None]:
# Preview the table with the lower-cased headers.
sup_list2.head()

In [None]:
# Drop the unnecessary columns: the two source publisher columns are
# superseded by the filled pub_nona column.
sup_list2 = sup_list2.drop(columns=['publisher', 'publishershort'])

In [None]:
# Preview the table after the source publisher columns were dropped.
sup_list2.head()

In [None]:
# Give the filled publisher column its final, shorter header.
sup_list2 = sup_list2.rename(columns={'pub_nona': 'pub'})
sup_list2.head()

In [None]:
# Invoice price before the exchange rate is applied:
# price * (1 - discount + lp_fee) * (1 + invoice_fee).
net_multiplier = 1 - sup_list2['discount'] + sup_list2['lp_fee']
invoice_multiplier = 1 + sup_list2['invoice_fee']
sup_list2['inv_price_before_exc'] = sup_list2['price'] * net_multiplier * invoice_multiplier

In [None]:
# Preview the rows with the new inv_price_before_exc column.
sup_list2.head()

In [None]:
# Next step: add the exchange rate column.
# Query text: fetch every row and column of the dbo.XChange table.
exchange_rate = """
SELECT *
FROM dbo.XChange
"""

In [None]:
# Load the exchange-rate table. Only exch_df is bound here: the chained
# `exch_df = df = ...` form also overwrote `df`, the publisher dictionary,
# so re-running the dictionary cells afterwards worked on the wrong table.
exch_df = pd.read_sql(exchange_rate, proc_db)

In [None]:
# Display the exchange-rate table.
exch_df

In [None]:
# Check that no column contains missing values (count of NaN per column).
sup_list2.isna().sum()

In [None]:
# Attach the exchange-rate columns to every supplier row, matching the row's
# currency to the rate table's `name`. The left join keeps rows with no match.
# NOTE(review): `currency` was trimmed and upper-cased earlier, while `name`
# is used as loaded. Confirm the codes match, because unmatched rows get a
# missing rate.
sup_list3 = sup_list2.merge(
    exch_df,
    how='left',
    left_on='currency',
    right_on='name',
)

In [None]:
# Preview the table with the exchange-rate columns attached.
sup_list3.head()

In [None]:
# Drop the unnecessary columns that the exchange-rate merge brought in.
sup_list3 = sup_list3.drop(columns=['id', 'name', 'updatedon'])

In [None]:
# Preview the table after the extra exchange-rate columns were dropped.
sup_list3.head()

In [None]:
# Landed cost: the invoice price multiplied by the exchange rate, plus the unit shipping cost.
converted_price = sup_list3['inv_price_before_exc'] * sup_list3['rate']
sup_list3['landed_cost'] = converted_price + sup_list3['unitshippingcost']

In [None]:
# Preview the rows with the new landed_cost column.
sup_list3.head()

In [None]:
# Final report: the selected columns, ordered from the lowest landed cost upward.
report_columns = [
    'isbn', 'supplier', 'pub', 'currency', 'rate', 'price', 'discount',
    'invoice_fee', 'lp_fee', 'inv_price_before_exc', 'unitshippingcost',
    'landed_cost', 'maxqtyperorder',
]
sup_bestcost = sup_list3.loc[:, report_columns].sort_values('landed_cost')

In [None]:
# Display the report sorted by landed cost.
sup_bestcost

In [None]:
# Remove every title whose price is zero.
has_nonzero_price = sup_bestcost['price'].ne(0)
sup_bestcost = sup_bestcost[has_nonzero_price]

In [None]:
# Display the final report after the zero-price titles were removed.
sup_bestcost