# Data Structures:

In [8]:
# Series: a one-dimensional labelled array; np.nan marks the missing entry
import pandas as pd
import numpy as np
series_values = [1, 3, 5, np.nan, 6, 8]
s = pd.Series(series_values)

# DataFrame: a two-dimensional table, assembled here from one tuple per row
people = [
    ('Alice', 25, 'New York'),
    ('Bob', 30, 'Paris'),
    ('Charlie', 35, 'London'),
]
df = pd.DataFrame(people, columns=['name', 'age', 'city'])
print(df)

      name  age      city
0    Alice   25  New York
1      Bob   30     Paris
2  Charlie   35    London


# Data Loading:

In [4]:
# Each read below rebinds `df`, so only the most recent successful load is kept.
# The file names are relative paths resolved against the kernel's working
# directory; a missing file raises FileNotFoundError.

# Read from CSV
df = pd.read_csv('data.csv')

# Read from Excel
df = pd.read_excel('data.xlsx', sheet_name='Sheet1')

# Read from SQL database
import sqlite3
from contextlib import closing

# sqlite3's own context manager only commits or rolls back a transaction; it
# does not close the connection. closing() releases the database handle even
# if the query raises.
with closing(sqlite3.connect('example.db')) as conn:
    df = pd.read_sql('SELECT * FROM my_table', conn)


FileNotFoundError: [Errno 2] No such file or directory: 'data.csv'

# Data Selection:

In [5]:
# One label -> a single column, returned as a Series
df.loc[:, 'name']

# A list of labels -> a DataFrame holding just those columns
df.loc[:, ['name', 'age']]

# A boolean mask keeps only the rows where it is True
older_than_30 = df['age'] > 30
df.loc[older_than_30]

# Masks combine element-wise with `&`; the last expression is the cell's output
lives_in_london = df['city'] == 'London'
df.loc[older_than_30 & lives_in_london]


Unnamed: 0,name,age,city
2,Charlie,35,London


# Data Manipulation:

In [3]:
# rename(), dropna() and fillna() each return a new object and leave `df`
# untouched, so every result is bound to its own name instead of being
# silently discarded.

# Rename columns
df_renamed = df.rename(columns={'name': 'full_name', 'city': 'location'})

# Drop rows with missing values
df_without_missing = df.dropna()

# Fill missing values with a default value
df_zero_filled = df.fillna(0)

# Group data by a column and calculate the mean of each group
# (runs on the original `df`, which still carries the 'city' column)
print(df.groupby('city')['age'].mean())


city
London      35.0
New York    25.0
Paris       30.0
Name: age, dtype: float64


# Data Visualization:

In [7]:
# NOTE(review): every plot below needs columns ('year', 'sales', 'salary',
# 'income') that the three-column name/age/city demo frame does not have --
# presumably a different dataset is expected in `df` here; confirm.

# Line chart ('line' is DataFrame.plot's default kind, spelled out here)
df.plot(kind='line', x='year', y='sales')

# Bar chart: one bar per 'name', bar height taken from 'salary'
df.plot(x='name', y='salary', kind='bar')

# Scatter plot of 'income' against 'age'
df.plot(x='age', y='income', kind='scatter')


KeyError: 'year'

# Data Cleaning:

In [None]:
# Each call below returns a new object; `df` itself is not modified.

# Drop duplicates
df_deduplicated = df.drop_duplicates()

# Fill missing values with the mean of each column
# numeric_only=True limits the mean to numeric columns. Without it, a frame
# that also holds string columns raises TypeError on pandas >= 2.0; columns
# that have no mean are simply left unfilled.
df_mean_filled = df.fillna(df.mean(numeric_only=True))

# Replace values in a column
# NOTE(review): assumes `df` has a 'category' column -- the three-column
# name/age/city demo frame has none; confirm which dataset is intended.
df['category'].replace({'A': 'apple', 'B': 'banana', 'C': 'cherry'})


# Data Transformation:

In [1]:
# Wide -> long: keep 'id' as the identifier and fold 'var1'/'var2' into
# (variable, value) pairs, one row per pair
df.melt(
    id_vars=['id'],
    value_vars=['var1', 'var2'],
    var_name='variable',
    value_name='value',
)

# Long -> wide: one row per 'date', one column per distinct 'variable'
df.pivot(columns='variable', index='date', values='value')

# Move the column labels into the innermost level of the row index
df.stack()

# Move the innermost row-index level out into the columns
df.unstack()


NameError: name 'pd' is not defined

# Time Series Analysis:

In [24]:
# Attach one timestamp per row, spaced one second apart.
# date_range() already returns a DatetimeIndex, so no to_datetime() conversion
# is needed. The ISO date string removes the day/month ambiguity of
# "10-05-2023", and len(df) keeps the number of timestamps equal to the number
# of rows instead of hard-coding 3.
df['date'] = pd.date_range("2023-10-05", periods=len(df), freq="s")
display(df)

# Set datetime column as index
# Rebinding `df` replaces inplace=True; the resulting frame is the same.
df = df.set_index('date')
display(df)

# Resample data by month
# resample() on its own only builds a lazy Resampler and leaves `df` unchanged,
# so an aggregation has to follow. size() counts the rows in each month and
# works whatever the column dtypes are. The MonthEnd offset object is the
# month-end frequency that the 'M' alias stood for, without depending on an
# alias that newer pandas releases deprecate.
rows_per_month = df.resample(pd.offsets.MonthEnd()).size()
display(rows_per_month)

# Compute rolling mean
# df.rolling(window=30).mean()


Unnamed: 0_level_0,name,age,city,date
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2023-10-05 00:00:00,Alice,25,New York,2023-10-05 00:00:00
2023-10-05 00:00:01,Bob,30,Paris,2023-10-05 00:00:01
2023-10-05 00:00:02,Charlie,35,London,2023-10-05 00:00:02


Unnamed: 0_level_0,name,age,city
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2023-10-05 00:00:00,Alice,25,New York
2023-10-05 00:00:01,Bob,30,Paris
2023-10-05 00:00:02,Charlie,35,London
