Notebook is being used to review the website: 
https://pandas.pydata.org/pandas-docs/stable/getting_started/comparison/comparison_with_sas.html

The article helps to show how procedures and data steps within SAS can be translated into Python script

In [1]:
# Import packages
import pandas as pd
import numpy as np

In [2]:
# Import csv files
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
#url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv"
df = pd.read_csv(url, sep=';')
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [3]:
# Displaying the first N rows of the DataFrame. Leaving the parenthesis () empty after referencing the DataFrame reference (df) defaults to 5 rows being printed.
# Adding a value into the parenthesis outputs the value of rows requested
# df.head()
# Take a sample of rows to review
df.sample(5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
1427,7.6,0.41,0.33,2.5,0.078,6.0,23.0,0.9957,3.3,0.58,11.2,5
650,10.7,0.43,0.39,2.2,0.106,8.0,32.0,0.9986,2.89,0.5,9.6,5
1444,7.3,0.67,0.02,2.2,0.072,31.0,92.0,0.99566,3.32,0.68,11.1,6
831,5.9,0.61,0.08,2.1,0.071,16.0,24.0,0.99376,3.56,0.77,11.1,6
1042,8.9,0.5,0.21,2.2,0.088,21.0,39.0,0.99692,3.33,0.83,11.1,6


In [4]:
# Displays the shape of the DataFrame (# rows, # columns)
df.shape

(1599, 12)

In [5]:
# Print the list of columns within a DataFrame
print(df.columns)

Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'quality'],
      dtype='object')


In [6]:
# Select a number of columns to display
df[["fixed acidity", "alcohol", "quality"]].head()

Unnamed: 0,fixed acidity,alcohol,quality
0,7.4,9.4,5
1,7.8,9.8,5
2,7.8,9.8,5
3,11.2,9.8,6
4,7.4,9.4,5


In [13]:
# Retain the columns to display within a list
sel_col = ["fixed acidity", "citric acid",  "alcohol", "quality", "density", "pH"]
df[sel_col].head()

Unnamed: 0,fixed acidity,citric acid,alcohol,quality,density,pH
0,7.4,0.0,9.4,5,0.9978,3.51
1,7.8,0.0,9.8,5,0.9968,3.2
2,7.8,0.04,9.8,5,0.997,3.26
3,11.2,0.56,9.8,6,0.998,3.16
4,7.4,0.0,9.4,5,0.9978,3.51


In [14]:
# Filter for the columns in a DataFrame
df1 = df[sel_col]

In [15]:
# Display the datatype of all the variables in a DataFrame
df1.dtypes

fixed acidity    float64
citric acid      float64
alcohol          float64
quality            int64
density          float64
pH               float64
dtype: object

In [16]:
# Displaying the cardinality of each column
print(df1.apply(lambda col: col.nunique()))

fixed acidity     96
citric acid       80
alcohol           65
quality            6
density          436
pH                89
dtype: int64


In [17]:
# Summarise the columns in the DataFrame. By default only the numeric variables will be reviewed
df1.describe()

Unnamed: 0,fixed acidity,citric acid,alcohol,quality,density,pH
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,8.319637,0.270976,10.422983,5.636023,0.996747,3.311113
std,1.741096,0.194801,1.065668,0.807569,0.001887,0.154386
min,4.6,0.0,8.4,3.0,0.99007,2.74
25%,7.1,0.09,9.5,5.0,0.9956,3.21
50%,7.9,0.26,10.2,6.0,0.99675,3.31
75%,9.2,0.42,11.1,6.0,0.997835,3.4
max,15.9,1.0,14.9,8.0,1.00369,4.01


In [18]:
# Summarise all variables in the DataFrame
df1.describe(include='all')

Unnamed: 0,fixed acidity,citric acid,alcohol,quality,density,pH
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,8.319637,0.270976,10.422983,5.636023,0.996747,3.311113
std,1.741096,0.194801,1.065668,0.807569,0.001887,0.154386
min,4.6,0.0,8.4,3.0,0.99007,2.74
25%,7.1,0.09,9.5,5.0,0.9956,3.21
50%,7.9,0.26,10.2,6.0,0.99675,3.31
75%,9.2,0.42,11.1,6.0,0.997835,3.4
max,15.9,1.0,14.9,8.0,1.00369,4.01


In [19]:
# Rename variable's
df2 = df1.rename(columns = {"fixed acidity":"fixed_acidity", "citric acid":"citric_acid"}, inplace=False)
df2.head()

Unnamed: 0,fixed_acidity,citric_acid,alcohol,quality,density,pH
0,7.4,0.0,9.4,5,0.9978,3.51
1,7.8,0.0,9.8,5,0.9968,3.2
2,7.8,0.04,9.8,5,0.997,3.26
3,11.2,0.56,9.8,6,0.998,3.16
4,7.4,0.0,9.4,5,0.9978,3.51


In [20]:
# Create a new variables
df2['half_pH'] = df2['pH'] / 2
df2.head()

Unnamed: 0,fixed_acidity,citric_acid,alcohol,quality,density,pH,half_pH
0,7.4,0.0,9.4,5,0.9978,3.51,1.755
1,7.8,0.0,9.8,5,0.9968,3.2,1.6
2,7.8,0.04,9.8,5,0.997,3.26,1.63
3,11.2,0.56,9.8,6,0.998,3.16,1.58
4,7.4,0.0,9.4,5,0.9978,3.51,1.755


In [21]:
# Aggregations
# Convert the variable to a numpy array to allow for quicker processing
ph = np.array(df2['pH'])

print("Mean pH: ", round(ph.mean(),1))
print("SD pH: ", ph.std())
print("Minimum pH: ", ph.min())
print("Max pH: ", ph.max())

Mean pH:  3.3
SD pH:  0.15433818141060165
Minimum pH:  2.74
Max pH:  4.01


In [22]:
# If/then logic
df2['high_pH'] = np.where(df2['pH'] > 3.4, 'high', 'low')
df2.sample(5)

Unnamed: 0,fixed_acidity,citric_acid,alcohol,quality,density,pH,half_pH,high_pH
811,12.9,0.55,10.9,6,1.00012,3.09,1.545,low
816,9.8,0.19,10.5,6,0.9984,3.23,1.615,low
182,7.2,0.02,9.3,5,0.9972,3.44,1.72,high
700,10.6,0.43,10.1,6,0.9984,3.08,1.54,low
864,7.2,0.06,9.5,5,0.99746,3.51,1.755,high


In [23]:
# Dates functionality
df2['date1'] = pd.Timestamp('2013-01-15')
df2['date2'] = pd.Timestamp('2015-02-15')
df2['date1_year'] = df2['date1'].dt.year
df2['date2_month'] = df2['date2'].dt.month
df2['date1_next'] = df2['date1'] + pd.offsets.MonthBegin()
df2.sample(5)

Unnamed: 0,fixed_acidity,citric_acid,alcohol,quality,density,pH,half_pH,high_pH,date1,date2,date1_year,date2_month,date1_next
240,8.9,0.37,9.3,5,0.9971,3.0,1.5,low,2013-01-15,2015-02-15,2013,2,2013-02-01
671,8.2,0.21,9.5,5,0.9968,3.2,1.6,low,2013-01-15,2015-02-15,2013,2,2013-02-01
1215,8.8,0.46,11.3,6,0.99488,3.26,1.63,low,2013-01-15,2015-02-15,2013,2,2013-02-01
627,8.8,0.29,9.1,5,0.9988,3.36,1.68,low,2013-01-15,2015-02-15,2013,2,2013-02-01
997,5.6,0.0,12.8,7,0.99378,3.71,1.855,high,2013-01-15,2015-02-15,2013,2,2013-02-01


In [24]:
# Selection of columns
# keep
df2[['fixed_acidity','alcohol','quality']].sample(5)

Unnamed: 0,fixed_acidity,alcohol,quality
200,9.6,10.3,7
329,10.7,9.5,5
476,9.3,9.7,5
413,9.9,11.7,7
842,10.6,10.9,6


In [25]:
# drop
df2.drop('density', axis=1).sample(5)

Unnamed: 0,fixed_acidity,citric_acid,alcohol,quality,pH,half_pH,high_pH,date1,date2,date1_year,date2_month,date1_next
815,10.8,0.33,10.8,5,3.24,1.62,low,2013-01-15,2015-02-15,2013,2,2013-02-01
997,5.6,0.0,12.8,7,3.71,1.855,high,2013-01-15,2015-02-15,2013,2,2013-02-01
201,8.8,0.48,9.3,5,3.04,1.52,low,2013-01-15,2015-02-15,2013,2,2013-02-01
1333,9.1,0.22,9.6,5,3.18,1.59,low,2013-01-15,2015-02-15,2013,2,2013-02-01
1290,7.6,0.0,11.0,5,3.36,1.68,low,2013-01-15,2015-02-15,2013,2,2013-02-01


In [26]:
# rename, >>> oldname:newname
df2.rename(columns={'alcohol':'content'}).sample(5)

Unnamed: 0,fixed_acidity,citric_acid,content,quality,density,pH,half_pH,high_pH,date1,date2,date1_year,date2_month,date1_next
1421,7.5,0.18,9.4,5,0.9965,3.34,1.67,low,2013-01-15,2015-02-15,2013,2,2013-02-01
880,9.2,0.18,9.9,5,0.99576,3.15,1.575,low,2013-01-15,2015-02-15,2013,2,2013-02-01
1587,5.8,0.11,10.9,6,0.99483,3.55,1.775,high,2013-01-15,2015-02-15,2013,2,2013-02-01
1115,7.0,0.07,11.3,6,0.99572,3.38,1.69,low,2013-01-15,2015-02-15,2013,2,2013-02-01
349,9.1,0.0,9.4,6,0.9994,3.36,1.68,low,2013-01-15,2015-02-15,2013,2,2013-02-01


In [27]:
# Sorting by values
df2_sorted = df2.sort_values(['alcohol','pH'])
df2_sorted.head()

Unnamed: 0,fixed_acidity,citric_acid,alcohol,quality,density,pH,half_pH,high_pH,date1,date2,date1_year,date2_month,date1_next
544,14.3,0.74,8.4,6,1.0008,2.86,1.43,low,2013-01-15,2015-02-15,2013,2,2013-02-01
517,10.4,0.49,8.4,3,0.9994,3.16,1.58,low,2013-01-15,2015-02-15,2013,2,2013-02-01
1436,10.0,0.38,8.5,5,0.99914,3.15,1.575,low,2013-01-15,2015-02-15,2013,2,2013-02-01
371,7.9,0.4,8.7,6,0.9967,3.32,1.66,low,2013-01-15,2015-02-15,2013,2,2013-02-01
528,8.2,0.49,8.7,6,0.9988,3.34,1.67,low,2013-01-15,2015-02-15,2013,2,2013-02-01


In [28]:
# String processing
# Length
df2['high_pH'].str.len().head()

0    4
1    3
2    3
3    3
4    4
Name: high_pH, dtype: int64

In [29]:
# Length - exclude trailing blanks
df2['high_pH'].str.rstrip().str.len().head()

0    4
1    3
2    3
3    3
4    4
Name: high_pH, dtype: int64

In [30]:
# Find, returns a value of -1 if the method fails to find the substring
# NOTE: remember that Python is case sensitive
# df2['high_pH'].str.find('S').head()
df2['high_pH'].str.find('s').head()

0   -1
1   -1
2   -1
3   -1
4   -1
Name: high_pH, dtype: int64

In [31]:
# Substring
# NOTE: Python indexes are zero-based
df2['high_pH'].str[0:1].sample(5)

372     l
1097    l
1450    l
521     h
1170    l
Name: high_pH, dtype: object

In [32]:
# Scan
# A simple approach to substring methods within Python
firstlast = pd.DataFrame({'String': ['John Smith', 'Jane Cook']})
firstlast['First_Name'] = firstlast['String'].str.split(" ",expand=True)[0]
firstlast['Last_Name'] = firstlast['String'].str.split(" ",expand=True)[1]
firstlast

Unnamed: 0,String,First_Name,Last_Name
0,John Smith,John,Smith
1,Jane Cook,Jane,Cook


In [33]:
# Upcase, lowcase and propcase
firstlast['string_up'] = firstlast['String'].str.upper()
firstlast['string_low'] = firstlast['String'].str.lower()
firstlast['string_prop'] = firstlast['String'].str.title()
firstlast

Unnamed: 0,String,First_Name,Last_Name,string_up,string_low,string_prop
0,John Smith,John,Smith,JOHN SMITH,john smith,John Smith
1,Jane Cook,Jane,Cook,JANE COOK,jane cook,Jane Cook


In [37]:
# Create two sample DataFrames for review
# Dataset 1
# Retain the columns to display within a list
column_list_1 = ["columns"]
df_m1 = df[column_list_1].sample(50000)
# Dataset 2
column_list_2 = ["columns2"]
df_m2 = df[column_list_2].sample(10000)

In [38]:
# Perform the joins
# Inner join
inner_join = df_m1.merge(df_m2, on=['id'], how='inner')
inner_join.shape

(4525, 7)

In [39]:
# Left join
left_join = df_m1.merge(df_m2, on=['id'], how='left')
left_join.shape

(50000, 7)

In [40]:
# Right join
right_join = df_m1.merge(df_m2, on=['id'], how='right')
right_join.shape

(10000, 7)

In [41]:
# Outer join
outer_join = df_m1.merge(df_m2, on=['id'], how='outer')
outer_join.shape

(55475, 7)

In [34]:
# Missing data - check the volume of missing values by variable
df2.isnull().sum().sort_values(ascending=False)

date1_next       0
date2_month      0
date1_year       0
date2            0
date1            0
high_pH          0
half_pH          0
pH               0
density          0
quality          0
alcohol          0
citric_acid      0
fixed_acidity    0
dtype: int64

In [36]:
# Drop missing values
df2_drop = df2.dropna()
df2_drop.shape

(1599, 13)

In [62]:
# Fill using the ffill method. The value from the previous record which has
# a not null value is retained
df_m1_filled = df_m1['date'].fillna(method='ffill')
df_m1_filled.head(10)

18011           None
105412          None
80936           None
76223           None
85880           None
87209           None
31362     2019-03-01
20696     2019-07-01
1330      2019-07-01
43125     2019-07-01
Name: arrears_date, dtype: object

In [37]:
# GroupBy
pd.options.display.float_format = '{:,.0f}'.format
# Aggregation
df2_summed = df2.groupby('high_pH')[['alcohol','pH']].sum()
df2_summed

Unnamed: 0_level_0,alcohol,pH
high_pH,Unnamed: 1_level_1,Unnamed: 2_level_1
high,4144,1358
low,12522,3936


In [38]:
# Performing aggregate view
df2.groupby(['high_pH', 'date1_year']).agg(
        # Sum alcohol
        sum_alc=('alcohol', sum)
       # , # Average alcohol
        #avg_alc=('alcohol', mean)
        , # Min alcohol
        min_alc=('alcohol', min)
        , # Max alcohol
        max_alc=('alcohol', max)
        , # Sum pH
        sum_pH=('pH', sum)
        , # Max pH
        max_pH=('pH', max)
        )

Unnamed: 0_level_0,Unnamed: 1_level_0,sum_alc,min_alc,max_alc,sum_pH,max_pH
high_pH,date1_year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
high,2013,4144,9,14,1358,4
low,2013,12522,8,15,3936,3


In [39]:
# Neater approach to aggregate data
df2.groupby(['high_pH','date1_year']).agg(
        {
            "alcohol": [sum, min, max,'mean','median']
            ,"pH": [sum, min,max]
        }
        )

Unnamed: 0_level_0,Unnamed: 1_level_0,alcohol,alcohol,alcohol,alcohol,alcohol,pH,pH,pH
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,min,max,mean,median,sum,min,max
high_pH,date1_year,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
high,2013,4144,9,14,11,11,1358,3,4
low,2013,12522,8,15,10,10,3936,3,3


In [40]:
# Use the ravel() method to create easier column names
df2_summary = df2.groupby(['high_pH','date1_year']).agg(
        {
            "alcohol": [sum, min, max,'mean','median']
            ,"pH": [sum, min,max]
        }
        )
df2_summary.columns = ["_".join(x) for x in df2_summary.columns.ravel()]
df2_summary

Unnamed: 0_level_0,Unnamed: 1_level_0,alcohol_sum,alcohol_min,alcohol_max,alcohol_mean,alcohol_median,pH_sum,pH_min,pH_max
high_pH,date1_year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
high,2013,4144,9,14,11,11,1358,3,4
low,2013,12522,8,15,10,10,3936,3,3


In [41]:
df2_summary.columns

Index(['alcohol_sum', 'alcohol_min', 'alcohol_max', 'alcohol_mean',
       'alcohol_median', 'pH_sum', 'pH_min', 'pH_max'],
      dtype='object')

In [43]:
# Neater approach to aggregate data
df2.groupby(['high_pH','date1_year']).agg(
        {
            "pH": [sum, min, max,'mean','median']
        }
        )

Unnamed: 0_level_0,Unnamed: 1_level_0,pH,pH,pH,pH,pH
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,min,max,mean,median
high_pH,date1_year,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
high,2013,1358,3,4,4,3
low,2013,3936,3,3,3,3


In [44]:
# By group processing
df2.groupby(['high_pH','date1_year']).first()

Unnamed: 0_level_0,Unnamed: 1_level_0,fixed_acidity,citric_acid,alcohol,quality,density,pH,half_pH,date1,date2,date2_month,date1_next
high_pH,date1_year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
high,2013,7,0,9,5,1,4,2,2013-01-15,2015-02-15,2,2013-02-01
low,2013,8,0,10,5,1,3,2,2013-01-15,2015-02-15,2,2013-02-01
