In [8]:
import pandas as pd
data= [10,20,30]
ser  = pd.Series(data)
print(ser)
print(ser[2])

0    10
1    20
2    30
dtype: int64
30


In [10]:
# With the index argument, you can name your own labels.

ser = pd.Series(data, index=['a','b','c'])
print(ser)
print(ser['c'])

a    10
b    20
c    30
dtype: int64
30


In [15]:
# You can also use a key/value object, like a dictionary, when creating a Series.

import pandas as pd

VarVal = {"ValueOne": 100, "ValueTwo": 200, "ValueThree": 300}

serVal = pd.Series(VarVal)

print(serVal)
print(serVal['ValueOne'])
print(serVal[0])

ValueOne      100
ValueTwo      200
ValueThree    300
dtype: int64
100
100


In [16]:
# To select only some of the items in the dictionary, use the index argument and specify only the items you want to include in the Series.

import pandas as pd

VarVal = {"ValueOne": 100, "ValueTwo": 200, "ValueThree": 300}

serVal = pd.Series(VarVal, index=['ValueOne', 'ValueTwo'])
print(serVal)


ValueOne    100
ValueTwo    200
dtype: int64


In [42]:
# DataFrames
# Data sets in Pandas are usually multi-dimensional tables, called DataFrames.

# Series is like a column, a DataFrame is the whole table.

import pandas as pd

d1 = {
    "Name":["XXX","YYY","ZZZ"],
    "Age":[10,20,30],
    "Location":["Chennai","Delhi","Kerala"]
}
df = pd.DataFrame(d1)
print(df)


  Name  Age Location
0  XXX   10  Chennai
1  YYY   20    Delhi
2  ZZZ   30   Kerala


In [43]:
# loc = returning the row values

# return row 1 value
print(df.loc[1])

# return/selecting row 0 & 1

print(df.loc[[0,1]])

#return/selecting the two column values
print(df[['Name',"Location"]])

Name          YYY
Age            20
Location    Delhi
Name: 1, dtype: object
  Name  Age Location
0  XXX   10  Chennai
1  YYY   20    Delhi
  Name Location
0  XXX  Chennai
1  YYY    Delhi
2  ZZZ   Kerala


In [40]:
# Specific Index name if wants to add - Named Index

df = pd.DataFrame(d1, index=['a','b','c'])
print(df)

#return the named index a value
print(df.loc['a'])


  Name  Age Location
a  XXX   10  Chennai
b  YYY   20    Delhi
c  ZZZ   30   Kerala
Name            XXX
Age              10
Location    Chennai
Name: a, dtype: object


In [None]:
#Reading the CSV
import pandas as pd

df = pd.read_csv('/content/sample_data/data.csv')
#will return first & last 5 rows
print(df)
#will return the entire data
print(df.to_string())

In [None]:
#Analyzing the data

print(df.head(10))
print(df.head())
print(df.tail())
print(df.tail(5))
print(df.info())


In [None]:
# Data cleaning means fixing bad data in your data set.

# Bad data could be:

# Empty cells
# Data in wrong format
# Wrong data
# Duplicates


#HANDLING THE EMPTY CELLS

import pandas as pd
df = pd.read_csv("data.csv")
new_df = pd.dropna()  # it'll create the newDataframe by removing the empty cells
pd.dropna(inplace=True) #It'll remove the empty cells on the original file itself

pd.fillna(205, inplace=True) # it'll replace the empty values with 205.

df['variety'].fillna("iris-setosa", inplace=True)

#We can use the mean & median values to replace

valToReplaceWithMean = df['petal.width'].mean()

valToReplaceWithMedian = df['petal.width'].median()

df['petal.width'].fillna(valToReplaceWithMean, inplace=True)

In [None]:
# HANDLING THE WRONG FORMAT DATA

import pandas as pd

df = pd.read_csv('/content/sample_data/data.csv')
df.dropna(subset=['sepal.width'], inplace=True)  #removing the NULL values from the 'sepal.width' column and updating the same dataframe
df['sepal.width'] = pd.to_numeric(df['sepal.width'])

# df['Date'] = pd.to_datetime(df['Date'])  #changing the format of column (In data.csv, we don't have datatime format)
print(df)

In [None]:
# HANDLING THE WRONG DATA

import pandas as pd

df = pd.read_csv("data.csv")

df.loc[4, "sepal.length"] = 5.2  # 4th row index and selecting column sepal.lenght and changing the value to 5.2
df.loc[4] = 5.2  # it'll change the entire 4th row index values
print(df)

for i in df.index:
  if df.loc[i, 'sepal.width'] > 6.1:
    df.loc[i, 'sepal.width'] = 6.0 #option 1
    df.drop(i, inplace=True) #option 2


In [None]:
# HANDLING THE DUPLICATE VALUES

import pandas as pd

df = pd.read_csv('data.csv')

df.duplicated()  # Returns True for every row that is a duplicate, othwerwise False

df.drop_duplicates(inplace=True) # it'll remove all the duplicates.



In [13]:
# FINDING THE CORRELATION RELATION SHIP WITH EACH COLUMNS

# The Result of the corr() method is a table with a lot of numbers that represents how well the relationship is between two columns.

# The number varies from -1 to 1.

# 1 means that there is a 1 to 1 relationship (a perfect correlation), and for this data set, each time a value went up in the first column, the other one went up as well.

# 0.9 is also a good relationship, and if you increase one value, the other will probably increase as well.

# -0.9 would be just as good relationship as 0.9, but if you increase one value, the other will probably go down.

# 0.2 means NOT a good relationship, meaning that if one value goes up does not mean that the other will.

import pandas as pd

df = pd.read_csv('data.csv')
df.corr()

  df.corr()


Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width
sepal.length,1.0,-0.109369,0.871754,0.817954
sepal.width,-0.109369,1.0,-0.420516,-0.356544
petal.length,0.871754,-0.420516,1.0,0.962757
petal.width,0.817954,-0.356544,0.962757,1.0


In [None]:
# PLOTTING USING PANDAS (Jus for knowledge)

import pandas as pd
import matplotlib as plt

df = pd.read_csv('data.csv')

df.plot(kind='scatter',x='petal.width',y='petal.length')
df['sepal.width'].plot(kind='hist')
plt.show()

In [None]:
#OTHER IMPORTANT FUNCTIONS OF PANDAS

import pandas as pd

df = pd.read_csv('data.csv') # keeping the sepal.length column as index column
# print(df.loc[5.1])

# print(df.isnull())  #will return if any null values
# print(len(df.isnull()))

print(list(df))  # it'll return only the column names


