# Documentation
https://pandas.pydata.org/docs/

In [None]:
pip install pandas



Import pandas and identify the version

In [None]:
import pandas as pd

# The last expression of a cell is echoed, so this reports the installed version.
pd.__version__

'2.1.4'

Creating Series

In [None]:
# Build a Series from a plain list; no index is given, so pandas numbers the rows 0, 1, 2.
obj = pd.Series(data=["Apple", "Banana", "Orange"])
obj

Unnamed: 0,0
0,Apple
1,Banana
2,Orange


In [None]:
# Look up the element stored under index label 0 of the Series built in the previous cell.
print(obj[0])

Apple


In [None]:
# Same values as before, but with explicit string labels instead of 0..n-1.
obj = pd.Series(
    data=["Apple", "Banana", "Orange"],
    index=["A", "B", "O"],
)
obj


Unnamed: 0,0
A,Apple
B,Banana
O,Orange


In [None]:
# Look up a value by its string index label (the Series now uses "A", "B", "O" as labels).
print(obj["A"])

Apple


In [None]:
# Build a Series from a list of integers (default integer index)
data_list = [10, 20, 30, 40]
series_from_list = pd.Series(data=data_list)

# Print a heading followed by the Series on the next line
print("Series from List:", series_from_list, sep="\n")

Series from List:
0    10
1    20
2    30
3    40
dtype: int64


In [None]:

# Values and their labels are supplied as two parallel lists of equal length
data_list = [10, 20, 30, 40]
custom_index = ['a', 'b', 'c', 'd']
series_custom_index = pd.Series(data=data_list, index=custom_index)

# Print a heading followed by the labelled Series
print("Series with Custom Index:", series_custom_index, sep="\n")

Series with Custom Index:
a    10
b    20
c    30
d    40
dtype: int64


In [None]:

# Build a Series from a dict: the keys become the index labels, the values the data
data_dict = {'A': "Apple", 'B': "Banana", 'C': "Cherry", 'D': "Dragon Fruit"}
series_from_dict = pd.Series(data=data_dict)

# Show the whole Series, then a single value looked up by its label
print("\nSeries with Custom Index from dict:", series_from_dict, sep="\n")
print(series_from_dict['D'])



Series with Custom Index from dict:
A           Apple
B          Banana
C          Cherry
D    Dragon Fruit
dtype: object
Dragon Fruit


Creating a Series with heterogeneous data types

In [None]:
import pandas as pd

# An int, a str, a float, a bool and None in one list: the resulting Series
# is reported with the generic `object` dtype.
heterogeneous_data = [1, 'apple', 3.14, True, None]
series_heterogeneous_datatypes = pd.Series(data=heterogeneous_data)

# Print a heading followed by the Series
print("Series with Heterogeneous Data Types:", series_heterogeneous_datatypes, sep="\n")


Series with Heterogeneous Data Types:
0        1
1    apple
2     3.14
3     True
4     None
dtype: object


Creating a DataFrame

In [None]:
# Column-oriented input: each key is a column name, each list holds that column's values
data_dict = {
    'Name': ['Alice', 'Bob', 'Charlie'],
    'Age': [25, 30, 22],
    'City': ['New York', 'San Francisco', 'Los Angeles'],
}
df_from_dict = pd.DataFrame(data=data_dict)

# Print a heading (preceded by a blank line) and then the table
print("\nDataFrame from Dictionary:", df_from_dict, sep="\n")


DataFrame from Dictionary:
      Name  Age           City
0    Alice   25       New York
1      Bob   30  San Francisco
2  Charlie   22    Los Angeles


In [None]:

# Row-oriented input: one dict per row, keys are the column names
data_list_of_dict = [
    {'Name': 'Alice', 'Age': 25, 'City': 'New York'},
    {'Name': 'Bob', 'Age': 30, 'City': 'San Francisco'},
    {'Name': 'Charlie', 'Age': 22, 'City': 'Los Angeles'},
]
df_from_list_of_dict = pd.DataFrame(data=data_list_of_dict)

# Print a heading (preceded by a blank line) and then the table
print("\nDataFrame from List of Dictionaries:", df_from_list_of_dict, sep="\n")



DataFrame from List of Dictionaries:
      Name  Age           City
0    Alice   25       New York
1      Bob   30  San Francisco
2  Charlie   22    Los Angeles


In [None]:
# Row-oriented input again, but Bob's row carries the string 'NA' as Age and None as City
data_list_of_dict = [
    {'Name': 'Alice', 'Age': 25, 'City': 'New York'},
    {'Name': 'Bob', 'Age': 'NA', 'City': None},
    {'Name': 'Charlie', 'Age': 22, 'City': 'Los Angeles'},
]
df_from_list_of_dict = pd.DataFrame(data=data_list_of_dict)

# Print a heading (preceded by a blank line) and then the table
print("\nDataFrame from List of Dictionaries:", df_from_list_of_dict, sep="\n")


DataFrame from List of Dictionaries:
      Name Age         City
0    Alice  25     New York
1      Bob  NA         None
2  Charlie  22  Los Angeles


Accessing elements in series

In [None]:
import pandas as pd

# Five values labelled 'a'..'e'; the last value is a float
data = [10, 20, 30, 40, 50.5]
index_labels = ['a', 'b', 'c', 'd', "e"]
pd_series = pd.Series(data=data, index=index_labels)
print(pd_series)

# One label -> a single scalar value
element_a = pd_series['a']
print(f"Element at index 'a': {element_a}")

# A list of labels -> a smaller Series holding just those entries
elements_b_and_c = pd_series[['b', 'c']]
print(f"Elements at indices 'b' and 'c':\n{elements_b_and_c}")

# A boolean mask -> only the entries where the condition is True
is_greater_than_20 = pd_series > 20
elements_greater_than_20 = pd_series[is_greater_than_20]
print(f"Elements greater than 20:\n{elements_greater_than_20}")


a    10.0
b    20.0
c    30.0
d    40.0
e    50.0
dtype: float64
Element at index 'a': 10.0
Elements at indices 'b' and 'c':
b    20.0
c    30.0
dtype: float64
Elements greater than 20:
c    30.0
d    40.0
e    50.0
dtype: float64


Accessing elements in dataframe

.loc[] - label-based indexing

.iloc[] - integer-based indexing in DataFrames


In [None]:
# Three-row example table with the default 0..2 integer index
data_dict = {
    'Name': ['Alice', 'Bob', 'Charlie'],
    'Age': [25, 30, 22],
    'City': ['New York', 'San Francisco', 'Los Angeles'],
}
df = pd.DataFrame(data=data_dict)
print("\nDataframe: \n", df)

# df[<column name>] returns that column as a Series
name_column = df['Name']
print("\nName Column:\n", name_column)

# .loc is label-based: row label 1, column label 'Age'
element_bob_age = df.loc[1, 'Age']
print("\nElement at row 1, column 'Age':", element_bob_age)

# .iloc is position-based: the second row (position 1)
row_bob = df.iloc[1]
print("\nRow for 'Bob':\n", row_bob)



Dataframe: 
       Name  Age           City
0    Alice   25       New York
1      Bob   30  San Francisco
2  Charlie   22    Los Angeles

Name Column:
 0      Alice
1        Bob
2    Charlie
Name: Name, dtype: object

Element at row 1, column 'Age': 30

Row for 'Bob':
 Name              Bob
Age                30
City    San Francisco
Name: 1, dtype: object


In [None]:
# Creating a DataFrame
data_dict = {'Name': ['Alice', 'Bob', 'Charlie'],
             'Age': [25, 30, 22],
             'City': ['New York', 'San Francisco', 'Los Angeles']}
df = pd.DataFrame(data_dict)
print("\nDataframe: \n", df)

# Use the 'Name' column as the row index. Assigning the result (instead of
# inplace=True) keeps the step explicit and safe to re-run.
df = df.set_index('Name')
print("\nDataframe: \n", df)

# Accessing a single column by column name
city_column = df['City']
print("\nCity Column:\n", city_column)

# Accessing a single element with loc: the row is now addressed by its
# 'Name' label ('Bob'), not by an integer position.
element_bob_age = df.loc['Bob', 'Age']
print("\nElement at row 'Bob', column 'Age':", element_bob_age)

# Accessing a row using iloc (position-based, so 1 is still the second row)
row_bob = df.iloc[1]
print("\nRow for 'Bob':\n", row_bob)


Dataframe: 
       Name  Age           City
0    Alice   25       New York
1      Bob   30  San Francisco
2  Charlie   22    Los Angeles

Dataframe: 
          Age           City
Name                       
Alice     25       New York
Bob       30  San Francisco
Charlie   22    Los Angeles

City Column:
 Name
Alice           New York
Bob        San Francisco
Charlie      Los Angeles
Name: City, dtype: object

Element at row 1, column 'Age': 30

Row for 'Bob':
 Age                30
City    San Francisco
Name: Bob, dtype: object


# Data Exploration - Display the basic information

In [None]:
import pandas as pd

# Creating a DataFrame
data = {'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva', 'Ford'],
        'Age': [25, 30, 22, 40, 18, 20],
        'Salary': [2500, 3000, 2200, 4000, 1800, 2000],
        'City': ['New York', 'San Francisco', 'Los Angeles', 'Folarida', 'Chicago', 'New York']}
df = pd.DataFrame(data)


# Display basic information about the DataFrame.
# df.info() prints its report itself and returns None, so it must not be
# wrapped in print() (that would add a stray "None" line to the output).
print("DataFrame Information:")
df.info()


DataFrame Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    6 non-null      object
 1   Age     6 non-null      int64 
 2   Salary  6 non-null      int64 
 3   City    6 non-null      object
dtypes: int64(2), object(2)
memory usage: 320.0+ bytes
None


In [None]:
import pandas as pd
import numpy as np
# Creating a DataFrame with one missing Salary and one missing City.
# np.nan is the supported spelling; the np.NaN alias was removed in NumPy 2.0.
data = {'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva', 'Ford'],
        'Age': [25, 30, 22, 40, 18, 20],
        'Salary': [2500, 3000, 2200, 4000, 1800, np.nan],
        'City': ['New York', 'San Francisco', None, 'Folarida', 'Chicago', 'New York']}
df = pd.DataFrame(data)


# Display basic information about the DataFrame.
# df.info() prints its report itself and returns None, so it must not be
# wrapped in print() (that would add a stray "None" line to the output).
print("DataFrame Information:")
df.info()

DataFrame Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Name    6 non-null      object 
 1   Age     6 non-null      int64  
 2   Salary  5 non-null      float64
 3   City    5 non-null      object 
dtypes: float64(1), int64(1), object(2)
memory usage: 320.0+ bytes
None


# Data Exploration - Descriptive Statistics

In [None]:
# Display basic descriptive statistics of numeric columns
print("\nDescriptive Statistics:")
print(df.describe())

# How to read each row of the output, using the 'Age' column as the example.
# (Kept as comments: a bare triple-quoted string at the end of a cell is an
# expression, so the notebook would echo it back as cell output.)
#
# count: The number of non-null values in the 'Age' column (in this case, 6).
# mean:  The average value of the 'Age' column (25.83).
# std:   The standard deviation, a measure of the amount of variation or
#        dispersion in the 'Age' values (8.11).
# min:   The minimum value in the 'Age' column (18).
# 25% (Q1): The first quartile, or 25th percentile. It is the value below
#        which 25% of the data falls (20.5).
# 50% (median or Q2): The second quartile, or 50th percentile (23.5). It is
#        the middle value of the dataset.
# 75% (Q3): The third quartile, or 75th percentile. It is the value below
#        which 75% of the data falls (28.75).
# max:   The maximum value in the 'Age' column (40).


Descriptive Statistics:
             Age       Salary
count   6.000000     6.000000
mean   25.833333  2583.333333
std     8.109665   810.966502
min    18.000000  1800.000000
25%    20.500000  2050.000000
50%    23.500000  2350.000000
75%    28.750000  2875.000000
max    40.000000  4000.000000


"\ncount: The number of non-null values in the 'Age' column (in this case, 6).\n\nmean: The average value of the 'Age' column (25.83).\n\nstd: The standard deviation, a measure of the amount of variation or dispersion in the 'Age' values (8.11).\n\nmin: The minimum value in the 'Age' column (18).\n\n25% (Q1): The first quartile, or 25th percentile. It is the value below which 25% of the data falls (20.5).\n\n50% (median or Q2): The second quartile, or 50th percentile (23.5). It is the middle value of the dataset.\n\n75% (Q3): The third quartile, or 75th percentile. It is the value below which 75% of the data falls (28.75).\n\nmax: The maximum value in the 'Age' column (40).\n"

In [None]:
# Same statistics with rows and columns swapped: one row per numeric column.
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Age,6.0,25.833333,8.109665,18.0,20.5,23.5,28.75,40.0
Salary,6.0,2583.333333,810.966502,1800.0,2050.0,2350.0,2875.0,4000.0



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.



# Data Exploration - Unique Values

In [None]:
import pandas as pd

# Six people; 'New York' appears twice in the City column
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva', 'Ford'],
    'Age': [25, 30, 22, 40, 18, 20],
    'Salary': [2500, 3000, 2200, 4000, 1800, 2000],
    'City': ['New York', 'San Francisco', 'Los Angeles', 'Folarida', 'Chicago', 'New York'],
}
df = pd.DataFrame(data=data)
df

Unnamed: 0,Name,Age,Salary,City
0,Alice,25,2500,New York
1,Bob,30,3000,San Francisco
2,Charlie,22,2200,Los Angeles
3,David,40,4000,Folarida
4,Eva,18,1800,Chicago
5,Ford,20,2000,New York


In [None]:
# Display the distinct values of the 'City' column (each value listed once)
unique_cities = df['City'].unique()
print("\nUnique Cities:", unique_cities)



Unique Cities: ['New York' 'San Francisco' 'Los Angeles' 'Folarida' 'Chicago']


In [None]:
# Display how many distinct values the 'City' column holds.
# nunique() returns a count, so the label says so explicitly.
nunique_cities = df['City'].nunique()
print("\nNumber of Unique Cities:", nunique_cities)


Unique Cities: 5


# Data Exploration - Unique Values Count

In [None]:
# Count how many rows carry each distinct 'City' value
city_counts = df['City'].value_counts()
print("\nCity Value Counts:\n", city_counts)



City Value Counts:
 City
New York         2
San Francisco    1
Los Angeles      1
Folarida         1
Chicago          1
Name: count, dtype: int64


# Data Manipulation - Selecting Columns

In [None]:
import pandas as pd

# Six-row example table
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva', 'Ford'],
    'Age': [25, 30, 22, 40, 18, 20],
    'City': ['New York', 'San Francisco', 'Los Angeles', 'Folarida', 'Chicago', 'New York'],
}
df = pd.DataFrame(data=data)

# A single column name inside [] gives back a Series
name_column = df['Name']
print("Name Column:\n", name_column)

# A list of column names inside [] gives back a DataFrame with just those columns
name_and_age = ['Name', 'Age']
selected_columns = df[name_and_age]
print("\nSelected Columns:\n", selected_columns)

# Same idea with a different pair of columns (the previous selection is overwritten)
name_and_city = ['Name', 'City']
selected_columns = df[name_and_city]
print("\nSelected Columns:\n", selected_columns)


Name Column:
 0      Alice
1        Bob
2    Charlie
3      David
4        Eva
5       Ford
Name: Name, dtype: object

Selected Columns:
       Name  Age
0    Alice   25
1      Bob   30
2  Charlie   22
3    David   40
4      Eva   18
5     Ford   20

Selected Columns:
       Name           City
0    Alice       New York
1      Bob  San Francisco
2  Charlie    Los Angeles
3    David       Folarida
4      Eva        Chicago
5     Ford       New York


# Data Manipulation - Filtering Rows

In [None]:
# Creating a DataFrame
data = {'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva', 'Ford'],
        'Age': [25, 30, 22, 40, 18, 20],
        'City': ['New York', 'San Francisco', 'Los Angeles', 'Folarida', 'Chicago', 'New York']}
df = pd.DataFrame(data)

# Filtering rows based on a numeric condition: keep people younger than 30
young_people = df[df['Age'] < 30]
print("\nYoung People:\n", young_people)

# Filtering rows based on an equality condition: keep people living in New York.
# This is a different selection, so it gets its own name and label instead of
# overwriting (and being printed as) "young people".
new_yorkers = df[df['City'] == 'New York']
print("\nPeople in New York:\n", new_yorkers)



Young People:
       Name  Age         City
0    Alice   25     New York
2  Charlie   22  Los Angeles
4      Eva   18      Chicago
5     Ford   20     New York

Young People:
     Name  Age      City
0  Alice   25  New York
5   Ford   20  New York


# Data Manipulation - Grouping and Aggregating

In [None]:
# Split the rows by 'City', then average the 'Age' values inside each group
grouped_data = df.groupby(by='City')['Age'].mean()
print("\nMean Age by City:\n", grouped_data)




Mean Age by City:
 City
Chicago          18.0
Folarida         40.0
Los Angeles      22.0
New York         22.5
San Francisco    30.0
Name: Age, dtype: float64


# Data Manipulation - Merging Dataframes




In [None]:
# Left table: one row per person
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva', 'Ford'],
    'Age': [25, 30, 22, 40, 18, 20],
    'City': ['New York', 'San Francisco', 'Los Angeles', 'Folarida', 'Chicago', 'New York'],
}
df = pd.DataFrame(data=data)
print(df)
print()

# Right table: population figures for only three of the cities
additional_data = {
    'City': ['New York', 'San Francisco', 'Los Angeles'],
    'Population': [8398748, 884363, 3979576],
}
df_population = pd.DataFrame(data=additional_data)
print(df_population)
print()

# Join the two tables on the shared 'City' column. An inner join (merge's
# default, spelled out here) keeps only rows whose city exists in both tables.
merged_df = df.merge(df_population, on='City', how='inner')
print("\nMerged DataFrame:\n", merged_df)

      Name  Age           City
0    Alice   25       New York
1      Bob   30  San Francisco
2  Charlie   22    Los Angeles
3    David   40       Folarida
4      Eva   18        Chicago
5     Ford   20       New York

            City  Population
0       New York     8398748
1  San Francisco      884363
2    Los Angeles     3979576


Merged DataFrame:
       Name  Age           City  Population
0    Alice   25       New York     8398748
1     Ford   20       New York     8398748
2      Bob   30  San Francisco      884363
3  Charlie   22    Los Angeles     3979576


# Data Manipulation - Handling Missing Data

In [None]:
import pandas as pd

# Ten people; None marks a missing value in 'Age' and 'City'
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve', 'Ford', 'Hendry', 'George', 'Harry', 'Xavier'],
    'Age': [25, 30, 50, 60, 28, 25, 30, None, None, 28],
    'City': ['New York', 'San Francisco', 'Los Angeles', None, 'Miami', 'New York', 'San Francisco', 'Los Angeles', None, 'Miami'],
}
# Alternative 'Age' / 'City' columns to experiment with (swap them into `data`):
#   'Age':  [20, None, None, None, None, None, None, None, None, None],
#   'City': [None, None, None, None, None, None, None, None, None, 'Miami']
# or:
#   'Age':  [25, 30, None, None, 28, 25, 30, None, None, 28],
#   'City': ['New York', 'San Francisco', 'Los Angeles', None, 'Miami', 'New York', 'San Francisco', 'Los Angeles', None, 'Miami']
df = pd.DataFrame(data=data)

print(df)
print(len(df))

# Element-wise mask: True wherever a value is missing
print("Missing Values:\n", df.isna())

# Number of missing values in each column
print("Missing Values:\n", df.isna().sum())

df.info()


      Name   Age           City
0    Alice  25.0       New York
1      Bob  30.0  San Francisco
2  Charlie  50.0    Los Angeles
3    David  60.0           None
4      Eve  28.0          Miami
5     Ford  25.0       New York
6   Hendry  30.0  San Francisco
7   George   NaN    Los Angeles
8    Harry   NaN           None
9   Xavier  28.0          Miami
10
Missing Values:
     Name    Age   City
0  False  False  False
1  False  False  False
2  False  False  False
3  False  False   True
4  False  False  False
5  False  False  False
6  False  False  False
7  False   True  False
8  False   True   True
9  False  False  False
Missing Values:
 Name    0
Age     2
City    2
dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Name    10 non-null     object 
 1   Age     8 non-null      float64
 2   City    8 non-null      object 
dtypes: float64(1), object(2)

In [None]:

# Set the threshold for missing values (30% in this example):
# a column is dropped when more than this fraction of its rows is missing.
missing_threshold = 0.30

# dropna's `thresh` is the minimum number of NON-missing values a column needs
# in order to be kept, and it is documented as an int. Convert the allowed
# fraction of missing rows into a whole row count instead of passing a float.
min_non_null = len(df) - int(missing_threshold * len(df))

# Drop columns with a large number of missing values
df_after = df.dropna(thresh=min_non_null, axis=1)
print (df_after)
# Checking for missing values
print("Missing Values:\n", df_after.isnull())

print("Missing Values:\n", df_after.isnull().sum())



      Name   Age           City
0    Alice  25.0       New York
1      Bob  30.0  San Francisco
2  Charlie  50.0    Los Angeles
3    David  60.0           None
4      Eve  28.0          Miami
5     Ford  25.0       New York
6   Hendry  30.0  San Francisco
7   George   NaN    Los Angeles
8    Harry   NaN           None
9   Xavier  28.0          Miami
Missing Values:
     Name    Age   City
0  False  False  False
1  False  False  False
2  False  False  False
3  False  False   True
4  False  False  False
5  False  False  False
6  False  False  False
7  False   True  False
8  False   True   True
9  False  False  False
Missing Values:
 Name    0
Age     2
City    2
dtype: int64


In [None]:
# Build the True/False "is missing" mask, then add it up column by column
missing_values_per_column = df.isna().sum()
print("\nMissing Values per Column:\n", missing_values_per_column)



Missing Values per Column:
 Name    0
Age     2
City    2
dtype: int64


In [None]:
# Same per-column count with the axis spelled out: summing along the index
# (axis 0) collapses the rows and leaves one total per column
missing_values_per_column = df.isnull().sum(axis="index")
print("\nMissing Values per Column:\n", missing_values_per_column)


Missing Values per Column:
 Name    0
Age     2
City    2
dtype: int64


In [None]:
# Summing along the columns (axis 1) leaves one total per row
missing_values_per_row = df.isnull().sum(axis="columns")
print("\nMissing Values per Row:\n", missing_values_per_row)



Missing Values per Row:
 0    0
1    0
2    2
3    2
4    0
5    0
6    0
7    1
8    2
9    0
dtype: int64


In [None]:
# Remove every row that has at least one missing value
# (axis=0 and how='any' are dropna's defaults, written out for clarity)
df_dropped_rows = df.dropna(axis=0, how='any')
print("\nDataFrame after Dropping Rows with Missing Values:\n", df_dropped_rows)



DataFrame after Dropping Rows with Missing Values:
       Name   Age           City
0    Alice  25.0       New York
1      Bob  30.0  San Francisco
2  Charlie  50.0    Los Angeles
4      Eve  28.0          Miami
5     Ford  25.0       New York
6   Hendry  30.0  San Francisco
9   Xavier  28.0          Miami


In [None]:
# Example table where 'Name' and 'Age' each have one missing entry
data = {
    'Name': ['Alice', 'Bob', None, 'David', 'Eve'],
    'Age': [25, 30, None, 35, 28],
    'City': ['New York', 'San Francisco', 'Los Angeles', 'Chicago', 'Miami'],
}
df = pd.DataFrame(data=data)

# Remove every column that contains at least one missing value
df_dropped_columns = df.dropna(axis='columns')
print("\nDataFrame after Dropping Columns with Missing Values:\n", df_dropped_columns)

# List the columns that contain at least one missing value
has_missing = df.isna().any()
missing_columns = df.columns[has_missing]
print("\nMissing Columns:", missing_columns)


DataFrame after Dropping Columns with Missing Values:
             City
0       New York
1  San Francisco
2    Los Angeles
3        Chicago
4          Miami

Missing Columns: Index(['Name', 'Age'], dtype='object')


In [None]:
data = {'Name': ['Alice', 'Bob', None, 'David', 'Eve'],
        'Age': [25, 30, None, 35, 28],
        'City': ['New York', 'San Francisco', 'Los Angeles', 'Chicago', 'Miami']}
df = pd.DataFrame(data)
print (df)

# Fill only the 'Age' column with its mean by passing a {column: value} mapping.
# A bare scalar (df.fillna(mean)) fills EVERY column, which would also write
# the numeric mean into the missing 'Name'.
df_filled_mean = df.fillna({'Age': df['Age'].mean()})
print("\nDataFrame after Filling Missing Values with Mean:\n", df_filled_mean)

# Update the original frame by assigning the filled column back.
# Calling fillna(..., inplace=True) on the df['Age'] selection is a chained
# assignment: deprecated in recent pandas and a no-op under Copy-on-Write.
df['Age'] = df['Age'].fillna(df['Age'].mean())
print("\nDataFrame after Filling Missing Values with Mean:\n", df)


    Name   Age           City
0  Alice  25.0       New York
1    Bob  30.0  San Francisco
2   None   NaN    Los Angeles
3  David  35.0        Chicago
4    Eve  28.0          Miami

DataFrame after Filling Missing Values with Mean:
     Name   Age           City
0  Alice  25.0       New York
1    Bob  30.0  San Francisco
2   29.5  29.5    Los Angeles
3  David  35.0        Chicago
4    Eve  28.0          Miami

DataFrame after Filling Missing Values with Mean:
     Name   Age           City
0  Alice  25.0       New York
1    Bob  30.0  San Francisco
2   None  29.5    Los Angeles
3  David  35.0        Chicago
4    Eve  28.0          Miami


In [None]:
# Filling missing values with a specific value (e.g., mean)



# Dropping rows with missing values



Alice - true
Bob   - false
Charlie -


Item 1 - 1
Item 2 - 2

Item 3 - 0 , mean

In [None]:
# Creating a DataFrame with missing values
data = {'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
        'Age': [25, 30, None, 35, 28],
        'City': ['New York', 'San Francisco', 'Los Angeles', 'Chicago', 'Miami']}
df = pd.DataFrame(data)


# Adding missing data to the DataFrame (Bob's age is blanked out as well)
df.loc[1, 'Age'] = None
print ("\n Dataframe with missing value \n", df)

# Checking for missing values
print("\nMissing Values:\n", df.isnull())

# Handling missing values (filling with the mean of the remaining ages).
# The filled column is assigned back: fillna(..., inplace=True) on the
# df['Age'] selection is a chained assignment, which is deprecated in recent
# pandas and does not update `df` under Copy-on-Write.
df['Age'] = df['Age'].fillna(df['Age'].mean())
print("\nDataFrame after Handling Missing Values:\n", df)




 Dataframe with missing value 
       Name   Age           City
0    Alice  25.0       New York
1      Bob   NaN  San Francisco
2  Charlie   NaN    Los Angeles
3    David  35.0        Chicago
4      Eve  28.0          Miami

Missing Values:
     Name    Age   City
0  False  False  False
1  False   True  False
2  False   True  False
3  False  False  False
4  False  False  False

DataFrame after Handling Missing Values:
       Name        Age           City
0    Alice  25.000000       New York
1      Bob  29.333333  San Francisco
2  Charlie  29.333333    Los Angeles
3    David  35.000000        Chicago
4      Eve  28.000000          Miami


In [None]:
import pandas as pd
import numpy as np

# Charlie's age is an empty string rather than a number or None
data = dict(
    Name=['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
    Age=[25, 30, "", 35, 28],
    City=['New York', 'San Francisco', 'Los Angeles', 'Chicago', 'Miami'],
)

# Build the table and echo it as the cell's output
df = pd.DataFrame(data=data)
df


Unnamed: 0,Name,Age,City
0,Alice,25.0,New York
1,Bob,30.0,San Francisco
2,Charlie,,Los Angeles
3,David,35.0,Chicago
4,Eve,28.0,Miami


In [None]:
# Print the column dtypes and non-null counts of the current DataFrame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    5 non-null      object
 1   Age     5 non-null      object
 2   City    5 non-null      object
dtypes: object(3)
memory usage: 248.0+ bytes


In [None]:
# Replace empty strings in 'Age' with NaN so the blank entry is treated as missing
df['Age'] = df['Age'].replace("", np.nan)
df

Unnamed: 0,Name,Age,City
0,Alice,25.0,New York
1,Bob,30.0,San Francisco
2,Charlie,,Los Angeles
3,David,35.0,Chicago
4,Eve,28.0,Miami


In [None]:
# Cast 'Age' to a numeric dtype; the NaN entry stays a missing value.
# errors='raise' is to_numeric's default, written out for clarity.
df['Age'] = pd.to_numeric(df['Age'], errors='raise')

# Show the new dtypes, a blank line, then the table itself
df.info()
print()
print(df)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Name    5 non-null      object 
 1   Age     4 non-null      float64
 2   City    5 non-null      object 
dtypes: float64(1), object(2)
memory usage: 248.0+ bytes

      Name   Age           City
0    Alice  25.0       New York
1      Bob  30.0  San Francisco
2  Charlie   NaN    Los Angeles
3    David  35.0        Chicago
4      Eve  28.0          Miami


In [None]:

# Calculate mean of 'Age' column, ignoring NaN values
mean_age = df['Age'].mean()

# Fill NaN values with the mean and assign the result back to the column.
# fillna(..., inplace=True) on the df['Age'] selection is a chained assignment:
# deprecated in recent pandas and a no-op under Copy-on-Write.
df['Age'] = df['Age'].fillna(mean_age)

print(df)


      Name   Age           City
0    Alice  25.0       New York
1      Bob  30.0  San Francisco
2  Charlie  29.5    Los Angeles
3    David  35.0        Chicago
4      Eve  28.0          Miami


# Sorting

In [None]:
# Show the table as it is before sorting
print("\nOriginal Data:\n", df)

# sort_values returns a new, sorted DataFrame and leaves `df` untouched;
# ascending=True is the default, written out for clarity
sorted_df = df.sort_values(by='Age', ascending=True)
print("\nSorted DataFrame by Age:\n", sorted_df)



Original Data:
       Name   Age           City
0    Alice  25.0       New York
1      Bob  30.0  San Francisco
2  Charlie   NaN    Los Angeles
3    David  35.0        Chicago
4      Eve  28.0          Miami

Sorted DataFrame by Age:
       Name   Age           City
0    Alice  25.0       New York
4      Eve  28.0          Miami
1      Bob  30.0  San Francisco
3    David  35.0        Chicago
2  Charlie   NaN    Los Angeles


Load CSV dataset into Pandas dataframe

In [None]:
# Read the Titanic CSV into a DataFrame and echo it as the cell's output.
# NOTE(review): the absolute '/content/...' path presumably only exists on Colab — confirm.
df = pd.read_csv('/content/titanic.csv')
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [None]:
# Print the column dtypes and non-null counts of the loaded dataset.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [None]:

# First five rows (5 is head()'s default, written out for clarity)
top_five_rows = df.head(n=5)
print("Top 5 Rows:\n", top_five_rows)




Top 5 Rows:
   SI.No.      State/UT/City  Estimated Mid Year Population  Rape - Incidence  \
0      1     Andhra Pradesh                          758.5               944   
1      2  Arunachal Pradesh                           12.0                36   
2      3              Assam                          263.0               762   
3      4              Bihar                         1005.6              1570   
4      5                Goa                           16.1                21   

   Rape - Rate  Rape - percentage contribution to All India  \
0          1.2                                          5.7   
1          3.0                                          0.2   
2          2.9                                          4.6   
3          1.6                                          9.5   
4          1.3                                          0.1   

   Kidnapping & Abduction - Incidence  Kidnapping & Abduction - Rate  \
0                                 716                  

In [None]:
# First three rows
top_three_rows = df.head(n=3)
print("Top 3 Rows:\n", top_three_rows)

Top 3 Rows:
   SI.No.      State/UT/City  Estimated Mid Year Population  Rape - Incidence  \
0      1     Andhra Pradesh                          758.5               944   
1      2  Arunachal Pradesh                           12.0                36   
2      3              Assam                          263.0               762   

   Rape - Rate  Rape - percentage contribution to All India  \
0          1.2                                          5.7   
1          3.0                                          0.2   
2          2.9                                          4.6   

   Kidnapping & Abduction - Incidence  Kidnapping & Abduction - Rate  \
0                                 716                            0.9   
1                                  42                            3.5   
2                                1101                            4.2   

   Kidnapping & Abduction - percentage contribution to All India  \
0                                                4.8    

In [None]:

# Last five rows (5 is tail()'s default, written out for clarity)
bottom_five_rows = df.tail(n=5)
print("Bottom 5 Rows:\n", bottom_five_rows)



Bottom 5 Rows:
           SI.No.   State/UT/City  Estimated Mid Year Population  \
54            52           Surat                           24.4   
55            53         Vadodra                           16.6   
56            54        Varanasi                           13.1   
57            55  Vishakhapatnam                           17.8   
58  Total Cities    Total Cities                          985.4   

    Rape - Incidence  Rape - Rate  \
54                16          0.7   
55                 6          0.4   
56                12          0.9   
57                14          0.8   
58              1078          1.1   

    Rape - percentage contribution to All India  \
54                                          1.5   
55                                          0.6   
56                                          1.1   
57                                          1.3   
58                                        100.0   

    Kidnapping & Abduction - Incidence  Kidnapping 

In [None]:
# Last three rows
bottom_three_rows = df.tail(n=3)
print("Bottom 3 Rows:\n", bottom_three_rows)

Bottom 3 Rows:
           SI.No.   State/UT/City  Estimated Mid Year Population  \
56            54        Varanasi                           13.1   
57            55  Vishakhapatnam                           17.8   
58  Total Cities    Total Cities                          985.4   

    Rape - Incidence  Rape - Rate  \
56                12          0.9   
57                14          0.8   
58              1078          1.1   

    Rape - percentage contribution to All India  \
56                                          1.1   
57                                          1.3   
58                                        100.0   

    Kidnapping & Abduction - Incidence  Kidnapping & Abduction - Rate  \
56                                  23                            1.8   
57                                   8                            0.4   
58                                1895                            1.9   

    Kidnapping & Abduction - percentage contribution to All India  \

In [None]:
import pandas as pd

# Load the Titanic CSV.
# NOTE(review): the absolute '/content/...' path presumably only exists on Colab — confirm.
df = pd.read_csv('/content/titanic.csv')

# Column dtypes and non-null counts (info() prints its own report)
df.info()

# Summary statistics of the numeric columns. Only the LAST expression of a
# cell is echoed, so a bare `df.describe()` in the middle of the cell is
# silently discarded; print it explicitly instead.
print(df.describe())

# Preview both ends of the table; the names reflect that these are not 5-row slices
first_rows = df.head(10)
print(first_rows)

last_rows = df.tail(3)
print(last_rows)

# Identifying the columns that contain at least one missing value
missing_columns = df.columns[df.isnull().any()]
print("\nMissing Columns:", missing_columns)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB

Missing Columns: Index(['Age', 'Cabin', 'Embarked'], dtype='object')


In [None]:
# Flag each column that has at least one missing value, then keep those column names
has_missing = df.isna().any()
missing_columns = df.columns[has_missing]
print("\nMissing Columns:", missing_columns)


Missing Columns: Index(['Age', 'Cabin', 'Embarked'], dtype='object')


In [None]:
# Mount Google Drive at /content/drive (google.colab is only importable inside Colab)
from google.colab import drive
drive.mount('/content/drive')

# Example of reading a CSV with pandas (left disabled):
# train = pd.read_csv('/content/fraudTrain.csv')


Mounted at /content/drive


In [None]:
# Read a CSV from the mounted Google Drive into a DataFrame, then print its
# column dtypes and non-null counts. Requires the drive mount at /content/drive.
df = pd.read_csv('/content/drive/MyDrive/Dataset/covid_19_india.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35995 entries, 0 to 35994
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Id              35995 non-null  int64  
 1   Province_State  15295 non-null  object 
 2   Country_Region  35995 non-null  object 
 3   Date            35995 non-null  object 
 4   ConfirmedCases  35995 non-null  float64
 5   Fatalities      35995 non-null  float64
dtypes: float64(2), int64(1), object(3)
memory usage: 1.6+ MB


In [None]:
# Display the first 5 rows of the dataset to get an overview.
df.head()


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
