# load data

In [None]:
import pandas as pd
import numpy as np

In [None]:
# add some extra random columns: one with bool values and three with float values
# N_ROWS has to equal the number of rows of the dataframe these arrays are
#   assigned to as columns, otherwise the assignment fails
N_ROWS = 87569
extra_col1 = np.random.randint(0, 2, (N_ROWS, 1), dtype=bool)  # random True/False
extra_col2 = np.random.randn(N_ROWS, 1)  # standard-normal floats, shape (N_ROWS, 1)
extra_col3 = np.random.randn(N_ROWS, 1)
extra_col4 = np.random.randn(N_ROWS, 1)

In [None]:
# load the example CSV directly from GitHub
# (the names of rows can be specified directly when loading data,
#   with the parameter index_col=name of desired column)
df = pd.read_csv("https://raw.githubusercontent.com"
                 "/CoreyMSchafer/code_snippets/master"
                 "/Python/Matplotlib/02-BarCharts/data.csv",
                 )
# attach the randomly generated arrays as four extra columns
df["some_column1"] = extra_col1
df["some_column2"] = extra_col2
df["some_column3"] = extra_col3
df["some_column4"] = extra_col4

In [None]:
df.shape # shape is an attribute, not a method, that is why there are no parentheses

In [None]:
# terminology:

"series - data of only one column"

"dataframe - data of multiple series"

# basic checking of data

In [None]:
# head will display only a limited number of rows and columns,
#   unless pd.set_option("display.max_columns", XXX) / pd.set_option("display.max_rows", XXX)
#   is set to specify the desired number of columns and rows
df.head(2)

In [None]:
# this is the same as head, but from the end
df.tail(2)

In [None]:
df.info()

In [None]:
# set custom number of columns to be printed
"""pd.set_option("display.max_columns", 48)"""

# set custom number of rows to be printed
"""pd.set_option("display.max_rows", XXX)"""

In [None]:
df.head()

# selecting rows and columns

In [None]:
# an idea of how pandas might work under the hood

# a plain dict already resembles a table: every key can be seen as the name
#   of a column and every position inside the lists as one row
custom_dict = dict(
    name=["Jon", "Ponna", "Lada"],
    age=[20, 32, 23],
    sex=["Male", "Female", "Female"],
)

# looking up a key returns the whole "column"
custom_dict["name"]

In [None]:
# try to load custom data to pandas

custom_df = pd.DataFrame(custom_dict)
custom_df

In [None]:
# access column

# it returns the same data as the basic dictionary above, but in pandas format
custom_df["name"] 

In [None]:
# access column, option 2 
# (df.columnname might cause errors, it might get confused with some method)
custom_df.name

In [None]:
# access more columns
custom_df[["name", "sex"]]

In [None]:
# get name of columns
custom_df.columns

In [None]:
# get columns and rows using "iloc" (integer location)

# access row
custom_df.iloc[0] # acces by integer location

In [None]:
# access multiple rows
custom_df.iloc[[0, 2]]

In [None]:
# access (with coordinates) with iloc specified row/s and column/s
print(custom_df)
print(custom_df.iloc[0, -1]) # access first row, last column 

In [None]:
# more rows, and columns
custom_df.iloc[[1, 2], [1, 2]] # access only 2nd & 3rd row + 2nd & 3rd column

In [None]:
# slicing is possible; the inner square brackets (lists) are not needed then
custom_df.iloc[1:3, 1:3]

In [None]:
# get columns and rows using "loc" (label based: search by name of row/column)
# with the default index the row labels happen to be integers, so rows still
#   look like they are accessed by position; columns, however, can be
#   selected by their name instead of by an integer
print(custom_df, end="\n"*2)

print(custom_df.loc[0]) # access everything in the row labelled 0

In [None]:
# access more rows
custom_df.loc[[0, 2]]

In [None]:
# access items based on row label and name of column
custom_df.loc[[1, 2], ["sex", "age"]] # select second and third row, and columns by their name

In [None]:
# check names of all columns in a dataset
df.columns

In [None]:
# check value counts of our custom bool values column
df["some_column1"].value_counts()

In [None]:
# slice through columns
# it is not possible to do it only via square brackets. iloc or loc has to be used, and
#   the row selection can be left open with a bare colon ":" (= all rows)
df.loc[0, "some_column1":"some_column3"]

In [None]:
df.loc[:, "some_column1":"some_column3"]

# How to Set, Reset, and Use Indexes

In [None]:
# sample "people" table, one list per column; the two missing e-mail
#   addresses are stored as NaN
custom_dict2 = {
    "name": ["Jon", "Ponna", "Lada", "Jožo", "Prlajz"],
    "last name": ["Bobo", "Wale", "Boro", "Zajeci", ""],
    "age": [20, 32, 23, 50, 34],
    "sex": ["Male", "Female", "Female", "Male", "Dog"],
    "email": [
        "JJ@email.com",
        "pony@gmail.com",
        "borovice@seznam.cz",
        np.nan,
        np.nan,
    ],
    "bills": [1000, 2000, 1000, 400, 800],
    "salary": [20000, 25000, 40000, 18000, 19000],
    "country": ["USA", "Germany", "Czechia", "Slovakia", "Poland"],
}

In [None]:
custom_df2 = pd.DataFrame(custom_dict2)
custom_df2

In [None]:
# set email addresses from above dataset as an index (names/labels of rows) of dataset
# with this we could use dataframe.loc to search by row name by email

# both rows do the same
"""custom_df2 = custom_df2.set_index("email")"""
custom_df2.set_index("email", inplace=True)

custom_df2

In [None]:
# check values by indexes
print(custom_df2.index) # name of rows
print()
print(custom_df2.sort_index(ascending=False, inplace=False)) # name of rows sorted, descending
print()
print(custom_df2.columns) # name of columns

In [None]:
custom_df2.loc["JJ@email.com"]

In [None]:
# return row names / indexes back to original values
custom_df2.reset_index(inplace=True)
custom_df2

# Using Conditionals to Filter Rows and Columns
& = AND;
| = OR;
~ = NOT;


In [None]:
# filter out only people that are Female
filter1 = (custom_df2["sex"] == "Female")

custom_df2[filter1]

In [None]:
# also possible to do the same with "loc"
# loc gives us more options, e.g. filtering columns at the same time
custom_df2.loc[filter1]

In [None]:
# filter emails of all females
custom_df2.loc[filter1, "email"]

In [None]:
# filter with more conditions
# (& = AND) (| = OR)

# filter everyone who meets at least one of these conditions
filter2 = (custom_df2["name"] == "Jon") | (custom_df2["bills"] == 1000)

custom_df2.loc[filter2]

In [None]:
# filter everyone who meets both of those conditions at the same time
filter3 = (custom_df2["name"] == "Jon") & (custom_df2["bills"] == 1000)

custom_df2.loc[filter3]

In [None]:
# filter everyone who meets NEITHER of those two conditions
#   (name is not "Jon" AND bills are not 1000)
# ~ sign negates a condition

filter4 = ~(custom_df2["name"] == "Jon") & ~(custom_df2["bills"] == 1000)

custom_df2.loc[filter4]

In [None]:
# filter everyone with bills higher than 1000,

# < > operator

filter5 = custom_df2["bills"] > 1000
custom_df2.loc[filter5]

In [None]:
# filter everyone with salary below 20000 and from Czechia, Slovakia and Poland

countries = ["Czechia", "Slovakia", "Poland"]

filter_6a = custom_df2["salary"] < 20_000
filter_6b = custom_df2["country"].isin(countries) # True for all rows whose country is one of the specified countries
filter6 = filter_6a & filter_6b # both conditions have to hold

custom_df2.loc[filter6]

In [None]:
# filter only rows that contain a given string

# na=False = treat missing (N/A) values as "no match", so such rows are not returned
filter7 = custom_df2["country"].str.contains("ia", na=False)

custom_df2.loc[filter7]

# Updating Rows and Columns - Modifying Data Within DataFrames

In [None]:
custom_df2.columns

In [None]:
# update all column names to title case (first letter of each word capitalised)

custom_df2.columns = [name.title() for name in custom_df2.columns]
custom_df2.columns

In [None]:
# rename column / columns

custom_df2.rename(columns={"Email": "Mail", "Name": "First_name"}, inplace=True)
custom_df2

In [None]:
# change a single value in a dataset

# change Jožo's salary to 18500

filter8 = custom_df2["First_name"].str.contains("Jožo")

custom_df2.loc[filter8, "Salary"] = 18500
custom_df2

In [None]:
# change whole row

# have to assign a value to each column of a given row
custom_df2.iloc[0] = ["Jonny@email.com", "Jon", "Bobo", 21, "Male", 1400, 22000, "USA"]
custom_df2

In [None]:
# Change all values of a given column - integers

custom_df2["Salary"] = custom_df2["Salary"] + 10_000
custom_df2

In [None]:
# change all values of a given column - strings

custom_df2["Sex"] = custom_df2["Sex"].str.lower()
custom_df2

#### apply
use any ***function()*** that will be applied on each value of a given column

- can be used both on series and on dataframes

In [None]:
# use it on a series

custom_df2["Country"].apply(len)

In [None]:
# use it on a series

def make_upper_case(parameter):
    """Return ``parameter`` converted to upper case via its ``upper()`` method."""
    upper_cased = parameter.upper()
    return upper_cased

custom_df2["Country"] = custom_df2["Country"].apply(make_upper_case)

# also possible with lambda
custom_df2["Sex"] = custom_df2["Sex"].apply(lambda parameter: parameter.title())

custom_df2

In [None]:
# use it on a dataframe

# this will return length of each series in a given dataframe
print(custom_df2.apply(len, axis="rows"))
print()
print(custom_df2.apply(len, axis="columns"))

#### apply map
this applies a function to every single value (element-wise) of the whole dataframe

In [None]:

# drop the rows that contain n/a values (labels 3 and 4) and the integer
#   columns, because len() would not work on those values
custom_df3 = custom_df2.drop(index=[3, 4], columns=["Bills", "Salary", "Age"]) 

# applymap calls len() on every single cell of the dataframe
# NOTE(review): newer pandas releases (2.1+) deprecate applymap in favour of
#   DataFrame.map - confirm against the installed pandas version
custom_df3.applymap(len)

In [None]:
# make all string values of the dataframe lowercase (integers and N/A values have to be dropped first)
custom_df3.applymap(str.lower)

### map
works only on series. Map is used to replace given values with other given values in a series

In [None]:
custom_df2

In [None]:
# replace every specified occurrence; occurrences that are not specified will be converted to N/A
custom_df4 = custom_df2.copy()
custom_df4["Sex"] = custom_df4["Sex"].map({"Male": "Chlap", "Female": "Zenska"})
custom_df4

In [None]:
# replace only specified occurrences; occurrences that are not specified will remain the same
custom_df5 = custom_df2.copy()
custom_df5["Sex"] = custom_df5["Sex"].replace({"Male": "Chlap", "Female": "Zenska"})
custom_df5

# Add/Remove Rows and Columns From DataFrames

In [None]:
# add a new column as a combination of existing ones
custom_df2["Full name"] = custom_df2["First_name"] + " " + custom_df2["Last Name"]
custom_df2

In [None]:
# remove columns

custom_df2.drop(columns=["First_name", "Last Name"], inplace=True)
custom_df2

In [None]:
# split existing column into multiple new columns

# operation at right side of '=' splits each string item into two items in a list, expand True
#   makes from this a pandas dataframe of two columns that are later assigned to their names
#   specified of the left side
custom_df2[["first name", "last name"]] = custom_df2["Full name"].str.split(" ", expand=True)
custom_df2

# add and remove rows

In [None]:
# add a new row, with only one value specified (all other columns become N/A)
# DataFrame.append was removed in pandas 2.0, pd.concat is its replacement;
#   like append it returns a new dataframe, custom_df2 itself stays unchanged
pd.concat([custom_df2, pd.DataFrame([{"Full name": "Vjenc Stejskalu"}])],
          ignore_index=True)

In [None]:
# append a whole dataset
# (pd.concat replaces DataFrame.append, which was removed in pandas 2.0)
custom_df6 = pd.DataFrame({"first name": ["Ragos", "Edward"],
                           "Age": [99, 100]})

# columns missing in one of the two dataframes are filled with N/A;
#   ignore_index=True renumbers the rows from 0
custom_df2 = pd.concat([custom_df2, custom_df6], ignore_index=True)
custom_df2

In [None]:
# drop a row/rows
custom_df2.drop(index=[5, 6], inplace=True)
custom_df2

In [None]:
# drop rows with a filter (similar to filtering with loc)
# the boolean mask gets its own name so that the builtin filter() is not shadowed
low_salary = custom_df2["Salary"] < 30_000
# custom_df2[low_salary].index = labels of the rows to drop
custom_df2.drop(custom_df2[low_salary].index, inplace=True)
custom_df2

# sorting data

In [None]:
# sort data at a given column
# two extra rows are added first for later use (pd.concat replaces
#   DataFrame.append, which was removed in pandas 2.0)
custom_df2 = pd.concat([custom_df2,
                        pd.DataFrame([{"Bills": 1000, "Age": 40},
                                      {"Bills": 1000, "Age": 36}])],
                       ignore_index=True)

# sort_values returns a new, sorted dataframe; it has to be the last expression
#   of the cell to be displayed. custom_df2 itself keeps its original order
custom_df2.sort_values(by="Bills", ascending=True)

In [None]:
# sort data based on multiple columns 
#   (once more rows have the same value, data will be sorted based on next column)

# if you want to have sorted by specified columns and have different orders (ascend./descend)
#   for each one, add a list of bools into ascending parameter 
custom_df2.sort_values(by=["Bills", "Age"], ascending=[False, True], inplace=True)



In [None]:
# sort back based on number of index column
custom_df2.sort_index()

In [None]:
# sort simply just one column (a series) by its values
custom_df2["Salary"].sort_values()

In [None]:
# get largest value (or number of them) from a series
custom_df2["Salary"].nlargest(2)

In [None]:
# get largest value of a series but return whole dataset
custom_df2.nlargest(2, "Salary")

In [None]:
# smallest numbers (simply the opposite of nlargest)
custom_df2.nsmallest(2, "Salary")

# Grouping and Aggregating - Analyzing and Exploring Your Data
aggregation = combining multiple parts of data into single result (mean, median...)

In [None]:
df = pd.read_csv("https://raw.githubusercontent.com/Ismaril/training/main/Machine%20learning%20Deep%20learning/csv%20files/Data%20Science%20Jobs%20Salaries.csv")
df.head()

In [None]:
df.info()

In [None]:
# median of every numeric column (ignores na values)
# numeric_only=True skips the text columns; without it pandas 2.0+ raises a
#   TypeError on non-numeric data
df.median(numeric_only=True)

In [None]:
# describe, some useful statistics info
# count = number of not NAN rows
df.describe().T

In [None]:
df["salary"].describe()

In [None]:
# summary of certain values in a given column
df["employment_type"].value_counts()

In [None]:
# summary of certain values in a given column as percentage
df["employment_type"].value_counts(normalize=True)

In [None]:
# groupby = splitting the data, applying a function, combining the results


# group by employee residence
# (a plain column name instead of a one-element list keeps the group keys
#   scalar, so a group can be looked up directly with "US")
country_group = df.groupby("employee_residence")

# visualise (this returns the same rows as filtering with
#   df.loc[df["employee_residence"] == "US"])
country_group.get_group("US").head(3)

In [None]:
country_group["salary_in_usd"].median().head()

In [None]:
# use aggregate function to apply multiple functions on a given 'groupby' dataset
country_group["salary_in_usd"].agg(["median", "min"]).head()

In [None]:
# check only US
country_group["salary_in_usd"].agg(["median", "min"]).loc["US"]

In [None]:
# count the researchers per country

# note that on a group object you have to use the apply method and pass a function to it
# (na=False counts a missing job title as "no match")
# the result gets a descriptive name instead of "_", which IPython uses for
#   the last cell output
researcher_counts = country_group["job_title"].apply(
    lambda titles: titles.str.contains("Research", na=False).sum()
)

df_ = researcher_counts.copy()  # independent copy of the counts
# keep only the countries that have at least one researcher
researcher_counts.loc[researcher_counts > 0]

# Cleaning Data - Casting Datatypes and Handling Missing Values

In [None]:
# build a dataframe with differently spelled "missing" values ("NA", "missing",
#   "Missing", None and keys that are simply left out) for practising cleaning
# (pd.concat replaces DataFrame.append, which was removed in pandas 2.0)
extra_rows = pd.DataFrame([
    {"first name": "Kuro",
     "Mail": "NA",
     "Age": "NA",
     "Sex": "NA",
     "Salary": "NA",
     "Country": "NA",
     "Full name": "missing",
     "last name": "missing",
     "Bills": "missing"},
    {"first name": "Zdeno", "Mail": "Missing"},
    {"first name": "Jaro", "Age": None, "Country": None},
])
df = pd.concat([custom_df2, extra_rows], ignore_index=True)

df

In [None]:
df.dropna()

In [None]:
# what dropna actually uses as default parameters: df.dropna(axis="index", how="any")

In [None]:
5+5