In [None]:
import os

import pandas as pd

# Expected layout: the notebook runs from the project folder, which contains
# the data folder:  Python02 > _data > BostonHousingData.csv
# The CSV does not have to sit in the working directory itself; it only has to
# be reachable through the path handed to pd.read_csv.
work_dir = os.getcwd()
print(work_dir)
print(os.path.join(work_dir, "_data"))

# Option 1: a relative path. It is resolved against the current working
# directory, so it only works while the notebook runs from the project folder.
data_frame = pd.read_csv("_data/BostonHousingData.csv")
display(data_frame.head(10))

# Option 2: build the path with os.path.join(). This is more robust and
# platform-independent because the correct separator is inserted for us.
csv_path = os.path.join(work_dir, "_data", "BostonHousingData.csv")  # full path to the CSV
data_set = pd.read_csv(csv_path)
data_set



In [None]:
# `shape` is an attribute of a DataFrame object, not a method, so it is read
# without parentheses: `data_set.shape`, never `data_set.shape()`.
data_set.shape
# The attribute holds a tuple.
type(data_set.shape)
# `axes` lists the row index first and the column index second.
data_set.axes

# A DataFrame carries two indexes: one for the rows and one for the columns.
# `index` returns the row index; the column index is available as `columns`.
data_set.index

# Small example: outer dictionary keys become column labels,
# inner dictionary keys become row labels.
my_data = dict(
    IT=dict(Name='Mark', Age=33, Gender='Male'),
    Design=dict(Name='Maria', Age=28, Gender='Female'),
)
data = pd.DataFrame(data=my_data)
display(data)
print(data.axes)
data.index
# data.axes  -> [Index(['Name', 'Age', 'Gender'], dtype='object'),   <- ROWS
#                Index(['IT', 'Design'], dtype='object')]            <- COLUMNS
# data.index -> Index(['Name', 'Age', 'Gender'], dtype='object')     <- ROWS only

In [None]:
# Which data type does each column hold?
data_set.dtypes

# Column glossary for the Boston housing data:
#   crim    - per capita crime rate by town
#   zn      - proportion of residential land zoned for lots over 25,000 sq.ft.
#   indus   - proportion of non-retail business acres per town
#   chas    - Charles River dummy variable (1 if the tract bounds the river, else 0)
#   nox     - nitrogen oxides concentration (parts per 10 million)
#   rm      - average number of rooms per dwelling
#   age     - proportion of owner-occupied units built prior to 1940
#   dis     - weighted mean of distances to five Boston employment centres
#   rad     - index of accessibility to radial highways
#   tax     - full-value property-tax rate per $10,000
#   ptratio - pupil-teacher ratio by town
#   black   - 1000(Bk - 0.63)^2, where Bk is the proportion of Black residents by town
#   lstat   - lower status of the population (percent)
#   medv    - median value of owner-occupied homes in $1000s

# How many data points (cells) does the frame hold in total?
data_set.size

In [None]:
# Sanity check: rows x columns equals the total cell count (506 * 14 = 7084).
data_set.shape[0] * data_set.shape[1] == data_set.size
data_set.shape


In [None]:
# 3.`pd.DataFrame` indexes, subsetting (or slicing), `loc` and `iloc` 

# `loc` picks rows (and/or columns) with **particular labels**, while
# `iloc` picks rows (and/or columns) at **integer locations**. 




In [None]:
# A small frame with a deliberately unordered integer index makes the
# difference between labels and positions easy to see.
my_data = pd.DataFrame(
    data={
        'name': ['Maria', 'Slobodan', 'Mark', 'Jack'],
        'language': ['English', 'Italian', 'English', 'French'],
    },
    index=[50, 35, 0, 7],
)
display(my_data)

# Goal: the whole 'name' column.
# `.loc` takes ROW labels first, so handing it only 'name' makes pandas look
# for a row labelled 'name' - and 'name' is a column label here.
# The fix is to address rows first and columns second: ':' selects every row,
# 'name' is the column label.
print(my_data.loc[:, 'name'])

# `.iloc` works with integer positions instead of labels.
# ':2' takes positions 0 and 1 (the end of the range is excluded), i.e. the
# rows labelled 50 and 35; ':' keeps every column ('name' and 'language').
print(my_data.iloc[:2, :])

In [None]:
# With a properly formed index, `.loc` can also slice by label.
# Note: `index` is an argument of pd.DataFrame, not a key of the data dictionary.
my_data = pd.DataFrame({'names': ['Maria', 'Slobodan', 'Mark', 'Jack'],
                        'languages': ['English', 'Italian', 'English', 'French']},
                       index=[12, 13, 14, 15])

# Label slices are INCLUSIVE on both ends (unlike positional slices):
# this returns the rows labelled 12, 13, 14 AND 15 of the column 'names'.
names_by_label = my_data.loc[12:15, 'names']
print(names_by_label)

# The row labelled 12 with all of its columns.
row_12 = my_data.loc[12, :]
print(row_12)

# Boolean selection with `loc`:
# the list [True, False, False, True] lines up with the rows in order; a row is
# kept where the value is True and skipped where it is False.
# The row selector and the column selector both go INSIDE the same brackets -
# writing `my_data.loc[mask], ['names']` would build a tuple instead.
bool_selection = my_data.loc[[True, False, False, True], ['names']]
bool_selection

In [None]:
# 4. More slicing of a `pd.DataFrame`

# Three ways to grab the whole 'medv' column from data_set:
data_set
print(data_set.loc[:, 'medv'])   # by label
data_set.iloc[:, 13]             # by integer position
data_set.iloc[:, -1]             # by counting from the end (last column)

# Grab 'indus':
display(data_set.head(10))
data_set.iloc[:, 2]
# Select the column first, then the rows at positions 0-9 (10 is excluded).
data_set['indus'][:10]
# A list of labels selects several columns at once; again rows 0-9.
data_set[['indus', 'medv']][:10]




In [None]:
# Select `crim` and `age` where `medv > 30`:
display(data_set.head(10))
# The comparison has to be applied to the column (data_set['medv'] > 30), not
# to the label string; it yields a boolean Series with one entry per row.
# `.loc[rows, columns]` then keeps the rows where the mask is True and only
# the listed columns.
display(data_set.loc[data_set['medv'] > 30, ['crim', 'age']])

# Select `rad`, `rm` and `zn` where `rad == 1`:
data_set.loc[data_set['rad'] == 1, ['rad', 'rm', 'zn']]

In [None]:
# `rad`, `rm` and `zn` for the rows where `rad` equals 1.
# Mind the bracket order: the row condition comes first, the list of column
# labels second, both inside the same `.loc[...]`.
data_set.loc[data_set['rad'].eq(1), ['rad', 'rm', 'zn']]


In [None]:
# Select `medv` where `medv >= 20` and turn it into a list:
display(data_set)
data_set.loc[data_set['medv'] >= 20, ['medv']]
# Calling list() on a DataFrame returns its column labels (['medv']), not the
# values. Select the column as a Series and convert that with .tolist().
data_list = data_set.loc[data_set['medv'] >= 20, 'medv'].tolist()
print(data_list)

# Find something at a particular value of a column:
data_set.loc[data_set['rm'] == 15, ['medv', 'rad']]

# Use multiple conditions:
# boolean masks are combined element-wise with `&` (and) or `|` (or); each
# condition needs its own parentheses because `&` binds tighter than `>=`/`==`.
data_set.loc[(data_set['medv'] >= 20) & (data_set['rm'] == 15)]

# Keep only the rows whose `medv` is one of a few particular values.
selected_medv = [12, 15, 19, 20]
data_set.loc[data_set['medv'].isin(selected_medv), ['medv', 'rm']]

# .isin() checks, element by element, whether a value is contained in the
# given list, set or other iterable. It is a common way to filter rows by a
# set of specific values.

# It also works without `.loc` / `.iloc`:
data_set['medv'].isin(selected_medv)   # boolean Series (the mask)

data_set[data_set['medv'].isin(selected_medv)]   # the mask must be handed back to data_set

# First ten values of `medv`:
data_set['medv'][:10]

# First ten values of `medv` and `crim`:
data_set[['medv', 'crim']][:10]
type(data_set[['medv', 'crim']][:10])   # pandas.core.frame.DataFrame

######## Check This ########### A little exercise: a DataFrame built from two Series
name = pd.Series(['Marko', 'Ana', 'Petra', 'Luka'],
                 index=['a', 'b', 'c', 'd'])
# The index belongs INSIDE the pd.Series(...) call. A trailing comma after the
# closing parenthesis would turn `age` into a one-element tuple and leave the
# index as an unrelated variable.
index = [1, 2, 3, 4]
age = pd.Series([15, 19, 22, 25], index=index)
print(name)
print(age)

# Each Series is aligned to the requested index by LABEL: `age` is labelled
# 1-4 and keeps its values, `name` is labelled a-d and therefore becomes NaN.
my_dict = pd.DataFrame({'name': name, 'age': age},
                       index=index)
display(my_dict)

my_dict.size
my_dict.add(1)
# What happens when 1 is added to a whole DataFrame?
# - Numeric columns: every value is increased by 1.
# - Non-numeric columns: the operation raises an error, because 1 cannot be
#   added to a string.
# - NaN (not a number): arithmetic with NaN yields NaN, so those cells stay NaN.
# A small example frame follows.

import pandas as pd

# Two numeric columns (A, B) and one string column (C), three rows each.
test_data = pd.DataFrame(
    data=dict(
        A=[1, 2, 3],
        B=[4, 5, 6],
        C=['X', 'Y', 'Z'],
    )
)
test_data.size



In [None]:
# Can I do the same arithmetic with a plain list?
# Note: list() on a DataFrame yields its column labels, not its values.
dlist = list(test_data)
print(dlist)

# Neither `+ 1` nor `** 2` is defined for lists. Each attempt is caught and
# printed so both errors are shown and the notebook keeps running.
for description, operation in (("dlist + 1", lambda: dlist + 1),
                               ("dlist ** 2", lambda: dlist ** 2)):
    try:
        operation()
    except TypeError as error:
        print(f"{description} -> TypeError: {error}")
# Can't do it with a list :)


In [None]:
# Drop a column from a DataFrame and put it back
# AXIS 0 => ROWS ; AXIS 1 => COLUMNS
# Let's test with 'medv'.
display(data_set.head(10))
# Keep the column as a separate Series so it can be restored later.
medv = data_set['medv'].copy()
print(medv.head(10))
# drop() removes a row or column; axis=1 targets columns (axis=0 targets rows).
display(data_set.drop('medv', axis=1))

# Very important: drop() returns a NEW frame and leaves data_set untouched
# unless inplace=True is passed.
display(data_set.head(5))  # still contains the medv column

# With inplace=True the column is removed from data_set itself and the call
# returns None, so the frame is displayed in a separate step.
# (Running this cell a second time fails until 'medv' has been put back.)
data_set.drop('medv', axis=1, inplace=True)
display(data_set.head(5))  # medv is gone
 

In [None]:
# Drop a column from a DataFrame and put it back
display(data_set)

# Basic way of putting the column back: plain assignment.
data_set['medv'] = medv
display(data_set.head(5))

# The same with pd.concat():
# pd.concat takes a list of DataFrames/Series and combines them along the given
# axis (0 stacks rows, 1 places columns side by side). It returns a new frame
# and does not modify its inputs.
# 'medv' is already back in data_set at this point, so it is dropped first;
# concatenating as-is would leave two columns named 'medv'.
data_set = pd.concat([data_set.drop(columns='medv'), medv], axis=1)
display(data_set)

In [None]:
# Change the order of columns in a DataFrame:
my_data = pd.DataFrame({'Name': ['Marija', 'Ann', 'Tony', 'Sabina', 'Peter'],
                        'Class': ['A', 'A', 'B', 'C', 'D'],
                        'Grade': [55, 73, 81, 78, 12]})
display(my_data)

columns = ['Name', 'Class', 'Grade']
new_order = ['Class', 'Name', 'Grade']

# 1) Select the columns in the desired order.
my_data = my_data[new_order]
display(my_data)

# 2) Use a sorting function for automatic (alphabetical) reordering.
my_data = my_data[sorted(my_data.columns)]
display(my_data)

# 3) reindex(): the labels must be passed through the `columns` keyword.
#    A bare positional list is applied to the ROW index instead and would
#    produce a frame full of NaN.
my_data = my_data.reindex(columns=columns)
display(my_data)

# Why [] in some places and () in others?
# - my_data[list_of_labels] is indexing: the square brackets hand the list to
#   the DataFrame, which returns those columns in exactly that order.
# - my_data.reindex(...) is a method call: arguments go inside parentheses and
#   the `columns=` keyword tells the method which axis the labels belong to.

In [None]:
### 5. A few more things I know about lists, dictionaries, and DataFrames! :)
# This is the tips & tricks section of the notebook: "how to" recipes.

# A `pd.DataFrame` from a nested list: every inner list is one row.
my_data = pd.DataFrame([['Milk', 10], ['Sugar', 15], ['Sour Cream', 13]],
                       columns=['Item', 'Price'])
display(my_data)

##########
# Two flat lists: the first positional argument is the data, the second one is
# the row index. Passing it as `index=` makes that explicit.
a = [1, 2, 3]
b = ['a', 'b', 'c']
my_data = pd.DataFrame(a, index=b)
display(my_data)
print(my_data.index)
# This works because both lists have three elements: `a` fills a single column
# and `b` labels its rows. An error is raised only when the lengths differ.

# To use both lists as columns instead, pass them in a dictionary:
a = [1, 2, 3]
b = ['a', 'b', 'c']
my_data = pd.DataFrame({'Column1': a, 'Column2': b})
display(my_data)

# A DataFrame from a nested list whose first inner list holds the column
# names; the row labels are supplied separately.
data = [
    ['col1', 'col2', 'col3'],
    [1, 2, 3],
    [4, 5, 6],
]
my_data = pd.DataFrame(
    data=data[1:],            # everything after the header row
    index=["row1", "row2"],
    columns=data[0],          # the header row
)

# Resulting layout:
#           col1  col2  col3    <- columns
#   row1       1     2     3    <- data
#   row2       4     5     6    <- data
#   ^ index



In [None]:
# Create a DataFrame from the following data:
data = [['Country', 'Capital', 'Population'],
        ['Serbia', 'Belgrade', 7000000],
        ['Germany', 'Berlin', 83000000],
        ['Japan', 'Tokyo', 126000000]]

# Use the first row for the column names and assign custom row labels.
my_list = pd.DataFrame(data[1:],
                       columns=data[0],
                       index=['Country-1', 'Country-2', 'Country-3'])
display(my_list)

# Add a new row for France with Paris as its capital and a population of 6700000.
# Option 1: assign to a new row label with .loc (values in column order).
my_list.loc['Country-4'] = ['France', 'Paris', 6700000]
display(my_list)

# Option 2: build a one-row DataFrame and append it with pd.concat().
# - columns=my_list.columns gives the new row exactly the same column names
#   (`data` is a plain list and has no `.columns`).
# - pd.concat() combines DataFrames; the default axis=0 stacks them vertically.
# The row added by option 1 is dropped first so France does not appear twice.
new_row = pd.DataFrame([['France', 'Paris', 6700000]],
                       columns=my_list.columns,
                       index=['Country-4'])
my_list = pd.concat([my_list.drop(index='Country-4'), new_row])
display(my_list)



In [None]:
# A DataFrame from several lists, one list per column.
name = ['John', 'Peter', 'Mike', 'Andrew']
salary = [4000, 3500, 1500, 5000]
yrs = [1, 3, 2, 4]

my_data = pd.DataFrame(
    data={'name': name, 'salary': salary, 'yrs': yrs},
    index=[1, 2, 3, 4],
)

display(my_data)

# The same with zip(): the lists defined above are reused unchanged.
# zip(name, salary, yrs) pairs the lists element by element and yields
#   ('John', 4000, 1), ('Peter', 3500, 3), ('Mike', 1500, 2), ('Andrew', 5000, 4)
# zip() is a built-in function that returns a lazy iterator, not a list;
# unpacking it into a list hands pandas a concrete list of row tuples.
my_zip_data = pd.DataFrame([*zip(name, salary, yrs)])
columns = ['Name', 'Salary', 'Years']
my_zip_data.columns = columns
print(my_zip_data)

# The column names can also be supplied in the same call:
my_zip_data = pd.DataFrame(
    [*zip(name, salary, yrs)],
    columns=['Name', 'Salary', 'Years'],
)




In [None]:
l1 = [1, 2]
l2 = ['a', 'b']
# zip() returns an iterator, which cannot be indexed. Turning it into a list
# first is what makes z[1] below possible.
z = list(zip(l1, l2))
print(z)
x = z[1]

# Alternatively, turn the pairs into a dictionary: the elements of l1 become
# the keys and the elements of l2 the values.
l1 = [1, 2]
l2 = ['a', 'b']
z = dict(zip(l1, l2))
z[1]  # key 1 -> 'a'
z[2]  # key 2 -> 'b'

In [131]:
# A dictionary from tuples with zip(): each key is paired with the tuple of
# values at the same position.
keys = ('name', 'last_name', 'department')
values = (
    ('Marija', 'Marko'),
    ('Janic', 'Miskovic'),
    ('Anthroplogy', 'Philosophy'),
)
my_data = {key: value for key, value in zip(keys, values)}
print(my_data)

TypeError: 'zip' object is not callable

In [133]:
# The same recipe with lists instead of tuples.
keys = ['name', 'last_name', 'department']
values = [
    ['Marija', 'Marko'],
    ['Janic', 'Miskovic'],
    ['Anthroplogy', 'Philosophy'],
]
my_data = {key: value for key, value in zip(keys, values)}
print(my_data)

TypeError: 'zip' object is not callable

In [136]:
 #From dictionary to `pd.DataFrame`, take 1:
# Dictionary of lists: every key becomes a column and the rows receive the
# default integer index 0..3.
my_data = {
    'Name': ['Maria', 'Anna', 'Sophia', 'Deborah'],
    'Height': [180, 167, 152, 177],
}

my_data = pd.DataFrame(data=my_data)
print(my_data)

# From dictionary to `pd.DataFrame`, take 2: a dictionary of dictionaries.
# Outer keys become the columns; the inner keys (the strings '0'..'3') become
# the row labels.
my_data = {
    "Name": {'0': 'Maria', '1': 'Anna', '2': 'Sophia', '3': 'Deborah'},
    "Height": {'0': 180, '1': 167, '2': 152, '3': 177},
}
my_data = pd.DataFrame(data=my_data)
display(my_data)

      Name  Height
0    Maria     180
1     Anna     167
2   Sophia     152
3  Deborah     177


Unnamed: 0,Name,Height
0,Maria,180
1,Anna,167
2,Sophia,152
3,Deborah,177
