In [None]:
'''Pandas is one of the most powerful libraries in Python for data manipulation and analysis. It allows you to easily load, process, and analyze data in structured formats like CSV, Excel, SQL, JSON, and more. In data science, it’s used to clean, transform, and visualize data for deeper insights.

Let’s break it down into simple steps to understand how you can use Pandas for data science! 📊🔍

🧠 Core Concepts of Pandas
Series: A one-dimensional labeled array (like a single column in a table, with an index label for every value).

DataFrame: A two-dimensional table (like a spreadsheet or a database table) made of rows and columns.'''

#1. Installing Pandas
!pip install pandas

#2. Importing Pandas
import pandas as pd

In [1]:

# 3. Creating a DataFrame
# The DataFrame is the central pandas structure: a table of rows and named
# columns in which every column may hold a different type of data.
# Example: building a DataFrame from a dictionary of columns.

import pandas as pd

# Each keyword becomes a column label; each list holds that column's values,
# one entry per row.
data = dict(
    Name=['Alice', 'Bob', 'Charlie'],
    Age=[25, 30, 35],
    City=['New York', 'Los Angeles', 'Chicago'],
)

# The dictionary keys turn into the column headers, in insertion order.
df = pd.DataFrame(data)

print(df)

      Name  Age         City
0    Alice   25     New York
1      Bob   30  Los Angeles
2  Charlie   35      Chicago


In [None]:
# 4. Reading Data from Files (CSV, Excel)
# pandas has a dedicated reader function per file format; the two calls below
# show the CSV and the Excel reader.

# Load a CSV file into a DataFrame.
df = pd.read_csv('data.csv')

# Load an Excel workbook into a DataFrame.
# Note: this rebinds `df`, so the frame read from the CSV file above is
# replaced by the one read from the workbook.
df = pd.read_excel('data.xlsx')

# Look at the top of the table; 5 rows is what head() returns by default.
preview = df.head(5)
print(preview)

In [None]:
# 5. Basic DataFrame Operations

# a. Accessing columns --> a column is looked up by its label.
# Pull out the 'Age' column (all rows).
ages = df.loc[:, 'Age']
print(ages)

# b. Selecting rows --> `iloc` selects by integer position, `loc` selects by
#    index label.
# Row at position 0, i.e. the first row.
first_row = df.iloc[0]

# Row whose index label is 0.
row_1 = df.loc[0]

# c. Filtering data --> keep only the rows for which a condition is true.
# Build a boolean mask for Age strictly greater than 30, then apply it.
is_over_30 = df['Age'].gt(30)
filtered_data = df.loc[is_over_30]
print(filtered_data)

# d. Sorting data --> order the rows by the values of any column.
# Highest Age first; the result is a new frame and `df` keeps its own order.
df_sorted = df.sort_values(by='Age', ascending=False)
print(df_sorted)

In [None]:
# 6. DataFrame Functions

# a. Summary statistics --> pandas provides helpers that summarise a frame.
# Summary statistics for the numerical columns.
print(df.describe())

# Overview of the frame (columns, data types, non-null counts).
# info() writes its report to stdout itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
df.info()

# b. Missing data handling
# Count the missing values in each column.
print(df.isnull().sum())

# New frame without the rows that contain missing values.
df_clean = df.dropna()

# New frame with every missing value replaced by 0.
df_filled = df.fillna(0)

# c. Groupby operations
# Group rows by a column and aggregate each group (sum, mean, count, ...).
# Here: mean 'Age' per 'City'.
grouped = df.groupby('City')['Age'].mean()
print(grouped)

In [None]:
# 7. Adding, Modifying, and Removing Columns
# Columns are added or overwritten by assigning to df[<label>].

# a. Add a new column
# One salary per existing row; the list length must match the row count.
salaries = [50000, 60000, 70000]
df['Salary'] = salaries
print(df)

# b. Modify an existing column
# Every 'Age' value goes up by 1. Because the result is written back into
# `df`, re-running this cell increments the ages again.
df['Age'] = df['Age'].add(1)
print(df)

# c. Remove a column
# drop() returns a new frame without 'Salary'; `df` is rebound to it.
df = df.drop(columns='Salary')
print(df)

In [None]:
# 8. Saving Data to Files
# After processing, the frame can be written back out to disk.
# index=False is passed to both writers so the row index is not written.

# a. Save to CSV
df.to_csv(path_or_buf='output.csv', index=False)

# b. Save to Excel
df.to_excel(excel_writer='output.xlsx', index=False)

In [2]:
# Example Project: Analyzing Student Data
# A small hand-written dataset is used to walk through basic data analysis.

# Step 1: Create a DataFrame
import pandas as pd

# One record per student: (name, math mark, science mark, english mark).
records = [
    ('Alice', 90, 85, 78),
    ('Bob', 80, 88, 85),
    ('Charlie', 70, 92, 80),
    ('David', 85, 75, 90),
    ('Eva', 95, 80, 88),
]
columns = ('Name', 'Math', 'Science', 'English')

# Transpose the records into a column-label -> list-of-values dictionary.
data = {label: list(values) for label, values in zip(columns, zip(*records))}

df = pd.DataFrame(data)
print(df)

      Name  Math  Science  English
0    Alice    90       85       78
1      Bob    80       88       85
2  Charlie    70       92       80
3    David    85       75       90
4      Eva    95       80       88


In [3]:
# Step 2: Calculate Average Marks for Each Student
# Take the mean of the three subject marks across each row and store it in a
# new 'Average' column on `df`.
subject_columns = ['Math', 'Science', 'English']
df['Average'] = df[subject_columns].mean(axis='columns')
print(df)

      Name  Math  Science  English    Average
0    Alice    90       85       78  84.333333
1      Bob    80       88       85  84.333333
2  Charlie    70       92       80  80.666667
3    David    85       75       90  83.333333
4      Eva    95       80       88  87.666667


In [4]:
# Step 3: Find the Highest Scorer in Each Subject

# For every subject column, idxmax() gives the index label of the row holding
# the highest mark; the result is keyed by subject name.
best_index = df[['Math', 'Science', 'English']].idxmax()

# Fetch the full row of the top student for each subject.
highest_math = df.loc[best_index['Math']]
highest_science = df.loc[best_index['Science']]
highest_english = df.loc[best_index['English']]

print("Highest Math Scorer:", highest_math['Name'])
print("Highest Science Scorer:", highest_science['Name'])
print("Highest English Scorer:", highest_english['Name'])


Highest Math Scorer: Eva
Highest Science Scorer: Charlie
Highest English Scorer: David
