# Tutorial on:
### 1. Python data structures - lists, sets, dictionaries, and tuples
### 2. Functions & Classes
### 3. Numpy - Arrays, Array math
### 4. Pandas - Loading a file, simple SQL queries

#### Importing required packages

In [11]:
import numpy as np
import pandas as pd

#### Basics on list, set, and dictionary

In [19]:
# Start from three empty containers: a list, a set and a dictionary.
sample_list = list()
sample_set = set()
sample_dict = {}

# --- List: ordered, keeps every element that is added ---
sample_list.extend([1, 2, 3])
print(f'Length of sample_list: {len(sample_list)}')  # Prints 3

# --- Set: the value 2 is offered three times but stored only once ---
sample_set.update([1, 2, 2, 2, 3])
print(f'Length of sample set: {len(sample_set)}')  # Prints 3 (a set keeps unique values only)

# --- Dictionary: maps each key to a value ---
sample_dict.update({'A': 1, 'B': 2, 'C': 3})
print(f'Length of sample_dict: {len(sample_dict)}')  # Prints 3
print('Keys of sample_dict:', sample_dict.keys(), sep='\n')
print(f'Value of the key "A" in sample_dict: {sample_dict["A"]}')

# Show the final content of all three containers, one per line.
print('Printing all list, set, and dictionary')
print(sample_list, sample_set, sample_dict, sep='\n')

Length of sample_list: 3
Length of sample set: 3
Length of sample_dict: 3
Keys of sample_dict:
dict_keys(['A', 'B', 'C'])
Value of the key "A" in sample_dict: 1
Printing all list, set, and dictionary
[1, 2, 3]
{1, 2, 3}
{'A': 1, 'B': 2, 'C': 3}


#### Basics on numpy

In [21]:
# 1. One-dimensional array built from a Python list
print('1. Simple array.....')
array1 = np.array([1, 2, 3])
print(type(array1), array1.shape, sep='\n')
print(end='\n\n')  # same two newlines as print('\n')

# 2. Two-dimensional array with 2 rows and 3 columns
print('2. Matrix.....')
array2 = np.array([[1, 2, 3], [4, 5, 6]])
print(array2.shape, array2[1, 2], sep='\n')  # [1, 2] -> second row, third column
print(end='\n\n')

# 3. Identity matrix of shape 3*3
print('3. Identity Matrix.....')
array3 = np.eye(3)
print(array3, end='\n\n')

# 4. 4*4 matrix filled with zeroes
print('4. Zero matrix.....')
array4 = np.zeros(shape=(4, 4))
print(array4, end='\n\n')

# 5. 5*5 matrix filled with ones
print('5. All-ones matrix.....')
array5 = np.ones(shape=(5, 5))
print(array5, end='\n\n')

# 6. 6*6 matrix filled with random values
print('6. Random matrix.....')
array6 = np.random.random(size=(6, 6))
print(array6, end='\n\n')

# 7. Data type of the elements stored in the random matrix
print('7. Data type of the random matirx.....')
print(array6.dtype, end='\n\n')

# 8. Element-wise square root of the random matrix
print('8. Square root of the random matrix.....')
print(np.sqrt(array6), end='\n\n')

1. Simple array.....
<class 'numpy.ndarray'>
(3,)


2. Matrix.....
(2, 3)
6


3. Identity Matrix.....
[[1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]]

4. Zero matrix.....
[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]

5. All-ones matrix.....
[[1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1.]]

6. Random matrix.....
[[0.91178265 0.85476908 0.57816056 0.90163698 0.69648952 0.0796907 ]
 [0.96056454 0.72223656 0.57350247 0.74104907 0.17360485 0.85945986]
 [0.75386554 0.81660472 0.7850079  0.2310595  0.12658271 0.42679972]
 [0.98669715 0.4935387  0.48218513 0.72956391 0.44290166 0.60187535]
 [0.47375326 0.44026527 0.76952098 0.03459023 0.41618507 0.99160378]
 [0.55445886 0.8665857  0.08016946 0.03149029 0.74493102 0.34135699]]

7. Data type of the random matirx.....
float64

8. Square root of the random matrix.....
[[0.95487311 0.92453723 0.7603687  0.94954567 0.83455948 0.28229542]
 [0.98008395 0.84984502 0.75729946 0.86084207 0.41665916 0.9270705

In [22]:
# Element-wise arithmetic on two 2*2 float matrices
print('Elementwise Addition, Subtraction, Multiplication, and Division of two matrices.....')
x = np.array([[1, 2], [3, 4]], dtype=np.float64)
y = np.array([[5, 6], [7, 8]], dtype=np.float64)

# Each print emits the label, the matrix on the next line, then a blank line.
print('Matrix x:', x, sep='\n', end='\n\n')
print('Matrix y:', y, sep='\n', end='\n\n')

# On ndarrays the operators +, -, * and / work element by element.
print('x+y:', x + y, sep='\n', end='\n\n')
print('x-y', x - y, sep='\n', end='\n\n')
print('x*y:', x * y, sep='\n', end='\n\n')
print('x/y:', x / y, sep='\n', end='\n\n')

Elementwise Addition, Subtraction, Multiplication, and Division of two matrices.....
Matrix x:
[[1. 2.]
 [3. 4.]]

Matrix y:
[[5. 6.]
 [7. 8.]]

x+y:
[[ 6.  8.]
 [10. 12.]]

x-y
[[-4. -4.]
 [-4. -4.]]

x*y:
[[ 5. 12.]
 [21. 32.]]

x/y:
[[0.2        0.33333333]
 [0.42857143 0.5       ]]



In [23]:
# Matrix product (rows of x times columns of y), as opposed to the
# element-wise product
print('Matrix multiplication of x,y:', x @ y, sep='\n', end='\n\n')

# Transpose: rows and columns of x swapped
print('Transpose of matrix x:', x.transpose(), sep='\n')

Matrix multiplication of x,y:
[[19. 22.]
 [43. 50.]]

Transpose of matrix x:
[[1. 3.]
 [2. 4.]]


In [24]:
# np.arange(8) gives a one-dimensional array holding 0..7 (shape (8,))
array7 = np.arange(8)
print('Array of 8 elements:', array7, sep='\n', end='\n\n')

# The same 8 values arranged as 2 rows of 4 columns
print('Reshaping 8 element array into 2*4 matrix:', array7.reshape((2, 4)), sep='\n', end='\n\n')

# The same 8 values arranged as a single column of 8 rows
print('Reshaping 8 element array into 8*1 matrix:', array7.reshape((8, 1)), sep='\n', end='\n\n')

Array of 8 elements:
[0 1 2 3 4 5 6 7]

Reshaping 8 element array into 2*4 matrix:
[[0 1 2 3]
 [4 5 6 7]]

Reshaping 8 element array into 8*1 matrix:
[[0]
 [1]
 [2]
 [3]
 [4]
 [5]
 [6]
 [7]]



In [25]:
# ndarray.tolist() turns the matrix into nested Python lists (one inner list per row)
print('Converting a matrix to a list:', array6.tolist(), sep='\n')

Converting a matrix to a list:
[[0.9117826537389395, 0.8547690833008422, 0.578160564086967, 0.9016369834481713, 0.6964895198874864, 0.07969070313486548], [0.9605645421455072, 0.7222365627581433, 0.5735024719796622, 0.7410490651374491, 0.17360485307032503, 0.8594598645476852], [0.7538655412315126, 0.8166047236617007, 0.7850078995564739, 0.23105950073800507, 0.12658270810980576, 0.42679971900412184], [0.9866971501406567, 0.49353870359766605, 0.4821851347557409, 0.7295639136000754, 0.4429016582743115, 0.6018753541000327], [0.4737532644376251, 0.4402652704547938, 0.7695209839292242, 0.03459022575690218, 0.41618506532055044, 0.9916037804537909], [0.5544588624091077, 0.866585699954466, 0.08016945988363877, 0.03149028582658009, 0.744931016867126, 0.3413569914305251]]


#### Basics on pandas

In [27]:
# Read the sales data from a .csv file into a pandas dataframe.
print('Loading the given .csv file...')
input_path = 'SalesJan2009.csv'
data_df = pd.read_csv(filepath_or_buffer=input_path)

# Show the column labels of the loaded dataframe.
print('Available columns in the dataframe.....', data_df.columns, sep='\n')

Loading the given .csv file...
Available columns in the dataframe.....
Index(['Transaction_date', 'Product', 'Price', 'Payment_Type', 'Name', 'City',
       'State', 'Country', 'Account_Created', 'Last_Login', 'Latitude',
       'Longitude'],
      dtype='object')


In [28]:
# Show the whole dataframe. The bare expression on the last line is what the
# notebook renders as the cell result; it must stay the final statement.
print('Complete data')
data_df

Complete data


Unnamed: 0,Transaction_date,Product,Price,Payment_Type,Name,City,State,Country,Account_Created,Last_Login,Latitude,Longitude
0,1/2/2009 6:17,Product1,1200,Mastercard,carolina,Basildon,England,United Kingdom,1/2/2009 6:00,1/2/2009 6:08,51.500000,-1.116667
1,1/2/2009 4:53,Product1,1200,Visa,Betina,Parkville,MO,United States,1/2/2009 4:42,1/2/2009 7:49,39.195000,-94.681940
2,1/2/2009 13:08,Product1,1200,Mastercard,Federica e Andrea,Astoria,OR,United States,1/1/2009 16:21,1/3/2009 12:32,46.188060,-123.830000
3,1/3/2009 14:44,Product1,1200,Visa,Gouya,Echuca,Victoria,Australia,9/25/2005 21:13,1/3/2009 14:22,-36.133333,144.750000
4,1/4/2009 12:56,Product2,3600,Visa,Gerd W,Cahaba Heights,AL,United States,11/15/2008 15:47,1/4/2009 12:45,33.520560,-86.802500
5,1/4/2009 13:19,Product1,1200,Visa,LAURENCE,Mickleton,NJ,United States,9/24/2008 15:19,1/4/2009 13:04,39.790000,-75.238060
6,1/4/2009 20:11,Product1,1200,Mastercard,Fleur,Peoria,IL,United States,1/3/2009 9:38,1/4/2009 19:45,40.693610,-89.588890
7,1/2/2009 20:09,Product1,1200,Mastercard,adam,Martin,TN,United States,1/2/2009 17:43,1/4/2009 20:01,36.343330,-88.850280
8,1/4/2009 13:17,Product1,1200,Mastercard,Renee Elisabeth,Tel Aviv,Tel Aviv,Israel,1/4/2009 13:03,1/4/2009 22:10,32.066667,34.766667
9,1/4/2009 14:11,Product1,1200,Visa,Aidan,Chatou,Ile-de-France,France,6/3/2008 4:22,1/5/2009 1:17,48.883333,2.150000


In [29]:
# len() of a dataframe is its number of rows
print(f'Number of rows in the dataframe: {len(data_df)}')

Number of rows in the dataframe: 998


In [30]:
# fillna returns a new dataframe in which every missing value is replaced by
# the given keyword; the result is bound to new_df2 and shown as the cell output.
print('Filling empty values with the keyword "Sample"')
new_df2 = data_df.fillna(value='Sample')
new_df2

Filling empty values with the keyword "Sample"


Unnamed: 0,Transaction_date,Product,Price,Payment_Type,Name,City,State,Country,Account_Created,Last_Login,Latitude,Longitude
0,1/2/2009 6:17,Product1,1200,Mastercard,carolina,Basildon,England,United Kingdom,1/2/2009 6:00,1/2/2009 6:08,51.500000,-1.116667
1,1/2/2009 4:53,Product1,1200,Visa,Betina,Parkville,MO,United States,1/2/2009 4:42,1/2/2009 7:49,39.195000,-94.681940
2,1/2/2009 13:08,Product1,1200,Mastercard,Federica e Andrea,Astoria,OR,United States,1/1/2009 16:21,1/3/2009 12:32,46.188060,-123.830000
3,1/3/2009 14:44,Product1,1200,Visa,Gouya,Echuca,Victoria,Australia,9/25/2005 21:13,1/3/2009 14:22,-36.133333,144.750000
4,1/4/2009 12:56,Product2,3600,Visa,Gerd W,Cahaba Heights,AL,United States,11/15/2008 15:47,1/4/2009 12:45,33.520560,-86.802500
5,1/4/2009 13:19,Product1,1200,Visa,LAURENCE,Mickleton,NJ,United States,9/24/2008 15:19,1/4/2009 13:04,39.790000,-75.238060
6,1/4/2009 20:11,Product1,1200,Mastercard,Fleur,Peoria,IL,United States,1/3/2009 9:38,1/4/2009 19:45,40.693610,-89.588890
7,1/2/2009 20:09,Product1,1200,Mastercard,adam,Martin,TN,United States,1/2/2009 17:43,1/4/2009 20:01,36.343330,-88.850280
8,1/4/2009 13:17,Product1,1200,Mastercard,Renee Elisabeth,Tel Aviv,Tel Aviv,Israel,1/4/2009 13:03,1/4/2009 22:10,32.066667,34.766667
9,1/4/2009 14:11,Product1,1200,Visa,Aidan,Chatou,Ile-de-France,France,6/3/2008 4:22,1/5/2009 1:17,48.883333,2.150000


In [31]:
# Keep only the rows that have no missing value in any column.
print('Droping off rows with atleast one empt values')
new_df = data_df.dropna(axis=0, how='any')
print(f'no. of rows after droping empty values: {len(new_df)}')
new_df

Droping off rows with atleast one empt values
no. of rows after droping empty values: 996


Unnamed: 0,Transaction_date,Product,Price,Payment_Type,Name,City,State,Country,Account_Created,Last_Login,Latitude,Longitude
0,1/2/2009 6:17,Product1,1200,Mastercard,carolina,Basildon,England,United Kingdom,1/2/2009 6:00,1/2/2009 6:08,51.500000,-1.116667
1,1/2/2009 4:53,Product1,1200,Visa,Betina,Parkville,MO,United States,1/2/2009 4:42,1/2/2009 7:49,39.195000,-94.681940
2,1/2/2009 13:08,Product1,1200,Mastercard,Federica e Andrea,Astoria,OR,United States,1/1/2009 16:21,1/3/2009 12:32,46.188060,-123.830000
3,1/3/2009 14:44,Product1,1200,Visa,Gouya,Echuca,Victoria,Australia,9/25/2005 21:13,1/3/2009 14:22,-36.133333,144.750000
4,1/4/2009 12:56,Product2,3600,Visa,Gerd W,Cahaba Heights,AL,United States,11/15/2008 15:47,1/4/2009 12:45,33.520560,-86.802500
5,1/4/2009 13:19,Product1,1200,Visa,LAURENCE,Mickleton,NJ,United States,9/24/2008 15:19,1/4/2009 13:04,39.790000,-75.238060
6,1/4/2009 20:11,Product1,1200,Mastercard,Fleur,Peoria,IL,United States,1/3/2009 9:38,1/4/2009 19:45,40.693610,-89.588890
7,1/2/2009 20:09,Product1,1200,Mastercard,adam,Martin,TN,United States,1/2/2009 17:43,1/4/2009 20:01,36.343330,-88.850280
8,1/4/2009 13:17,Product1,1200,Mastercard,Renee Elisabeth,Tel Aviv,Tel Aviv,Israel,1/4/2009 13:03,1/4/2009 22:10,32.066667,34.766667
9,1/4/2009 14:11,Product1,1200,Visa,Aidan,Chatou,Ile-de-France,France,6/3/2008 4:22,1/5/2009 1:17,48.883333,2.150000


In [32]:
# Add a new column of random values to the dataframe.
print('Appending a new column in the dataframe....')
# Report the actual row count instead of a hard-coded number, so the message
# stays correct if the input file changes.
print(f'Creating a random array of length {len(data_df)}(no. of rows in the original dataframe)')
sample_column_array = np.random.random(len(data_df))

print('Adding a new column named "Example_Column" in the data frame')
# A one-dimensional NumPy array with one value per row can be assigned
# directly as a column; converting it to a list first is not needed.
data_df['Example_Column'] = sample_column_array
data_df

Appending a new column in the dataframe....
Creating a random array of length 998(no. of rows in the original dataframe)
Adding a new column named "Example_Column" in the data frame


Unnamed: 0,Transaction_date,Product,Price,Payment_Type,Name,City,State,Country,Account_Created,Last_Login,Latitude,Longitude,Example_Column
0,1/2/2009 6:17,Product1,1200,Mastercard,carolina,Basildon,England,United Kingdom,1/2/2009 6:00,1/2/2009 6:08,51.500000,-1.116667,0.905082
1,1/2/2009 4:53,Product1,1200,Visa,Betina,Parkville,MO,United States,1/2/2009 4:42,1/2/2009 7:49,39.195000,-94.681940,0.344003
2,1/2/2009 13:08,Product1,1200,Mastercard,Federica e Andrea,Astoria,OR,United States,1/1/2009 16:21,1/3/2009 12:32,46.188060,-123.830000,0.546625
3,1/3/2009 14:44,Product1,1200,Visa,Gouya,Echuca,Victoria,Australia,9/25/2005 21:13,1/3/2009 14:22,-36.133333,144.750000,0.975556
4,1/4/2009 12:56,Product2,3600,Visa,Gerd W,Cahaba Heights,AL,United States,11/15/2008 15:47,1/4/2009 12:45,33.520560,-86.802500,0.734670
5,1/4/2009 13:19,Product1,1200,Visa,LAURENCE,Mickleton,NJ,United States,9/24/2008 15:19,1/4/2009 13:04,39.790000,-75.238060,0.255695
6,1/4/2009 20:11,Product1,1200,Mastercard,Fleur,Peoria,IL,United States,1/3/2009 9:38,1/4/2009 19:45,40.693610,-89.588890,0.527148
7,1/2/2009 20:09,Product1,1200,Mastercard,adam,Martin,TN,United States,1/2/2009 17:43,1/4/2009 20:01,36.343330,-88.850280,0.669344
8,1/4/2009 13:17,Product1,1200,Mastercard,Renee Elisabeth,Tel Aviv,Tel Aviv,Israel,1/4/2009 13:03,1/4/2009 22:10,32.066667,34.766667,0.795691
9,1/4/2009 14:11,Product1,1200,Visa,Aidan,Chatou,Ile-de-France,France,6/3/2008 4:22,1/5/2009 1:17,48.883333,2.150000,0.582104


In [33]:
# Build a narrower dataframe that keeps all rows but only the listed columns.
print('Getting a sample of the dataframe with selected columns')
sample_df = data_df.loc[:, ['Name', 'City', 'State', 'Country']]
sample_df

Getting a sample of the dataframe with selected columns


Unnamed: 0,Name,City,State,Country
0,carolina,Basildon,England,United Kingdom
1,Betina,Parkville,MO,United States
2,Federica e Andrea,Astoria,OR,United States
3,Gouya,Echuca,Victoria,Australia
4,Gerd W,Cahaba Heights,AL,United States
5,LAURENCE,Mickleton,NJ,United States
6,Fleur,Peoria,IL,United States
7,adam,Martin,TN,United States
8,Renee Elisabeth,Tel Aviv,Tel Aviv,Israel
9,Aidan,Chatou,Ile-de-France,France


In [34]:
# Select the rows whose Country equals "United States" using a boolean mask
sample_df2 = data_df.loc[data_df['Country'] == 'United States']
print(f"No. of rows in the dataframe with Country = 'United States': {len(sample_df2)}")

No. of rows in the dataframe with Country = 'United States': 463


In [35]:
# Querying the dataframe with multiple conditions:
# Country is "United States" AND Price is below 3000.
#
# The price is compared as a number. Comparing against the string '3000'
# is a character-by-character comparison when the column holds text
# (so '13,000' would count as smaller than '3000') and is an error when
# the column holds numbers.
# NOTE(review): presumably 'Price' was loaded as text because some values
# contain a thousands separator -- confirm against the .csv file.
price_as_number = pd.to_numeric(
    data_df['Price'].astype(str).str.replace(',', '', regex=False),
    errors='coerce',  # values that are not numbers become NaN and never match
)
sample_df3 = data_df[(data_df['Country'] == 'United States') & (price_as_number < 3000)]
print(len(sample_df3))

399
