In [1]:
# Cell 1: Standard Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings

# Settings to make things look nice
pd.set_option('display.max_columns', None) # Show all columns
pd.set_option('display.max_rows', 100)     # Show 100 rows
warnings.filterwarnings('ignore')          # Hide messy warnings

# Set plot style
sns.set(style="whitegrid")

print("✅ Libraries Loaded and Ready!")

✅ Libraries Loaded and Ready!


In [2]:
# Cell 2: Data Inspection Function
def inspect_data(df):
    """
    Returns a quick summary of the dataset including:
    - Shape (rows, cols)
    - Missing values
    - Duplicates
    - Data types
    """
    print("------- DATA SHAPE -------")
    print(f"Rows: {df.shape[0]}, Columns: {df.shape[1]}")
    
    print("\n------- MISSING VALUES -------")
    missing = df.isnull().sum()
    missing = missing[missing > 0]
    if missing.empty:
        print("No missing values found.")
    else:
        print(missing.sort_values(ascending=False))
        
    print("\n------- DUPLICATES -------")
    dups = df.duplicated().sum()
    print(f"Duplicate Rows: {dups}")
    
    print("\n------- DATA TYPES -------")
    print(df.dtypes)
    
    print("\n------- FIRST 3 ROWS -------")
    display(df.head(3)) # using display() for nicer formatting in Jupyter

print("✅ Helper Function 'inspect_data(df)' created!")

✅ Helper Function 'inspect_data(df)' created!


In [16]:
# Try loading the Titanic data
try:
    df_titanic = pd.read_csv('archive/titanic.csv')
    print("Titanic Loaded Successfully!")
    inspect_data(df_titanic) # Use your helper function!
except Exception as e:
    print(f"Error: {e}")

Titanic Loaded Successfully!
------- DATA SHAPE -------
Rows: 891, Columns: 12

------- MISSING VALUES -------
Cabin       687
Age         177
Embarked      2
dtype: int64

------- DUPLICATES -------
Duplicate Rows: 0

------- DATA TYPES -------
PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

------- FIRST 3 ROWS -------


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [9]:
# Try loading the Superstore data
try:
    # This might fail with default encoding
    df_store = pd.read_csv('archive/superstore.csv', encoding='latin1') 
    print("Superstore Loaded Successfully with Latin1!")
    inspect_data(df_store) # Use your helper function!
except Exception as e:
    print(f"Error: {e}")

Superstore Loaded Successfully with Latin1!
------- DATA SHAPE -------
Rows: 51290, Columns: 27

------- MISSING VALUES -------
No missing values found.

------- DUPLICATES -------
Duplicate Rows: 0

------- DATA TYPES -------
Category           object
City               object
Country            object
Customer.ID        object
Customer.Name      object
Discount          float64
Market             object
è®°å½æ°           int64
Order.Date         object
Order.ID           object
Order.Priority     object
Product.ID         object
Product.Name       object
Profit            float64
Quantity            int64
Region             object
Row.ID              int64
Sales               int64
Segment            object
Ship.Date          object
Ship.Mode          object
Shipping.Cost     float64
State              object
Sub.Category       object
Year                int64
Market2            object
weeknum             int64
dtype: object

------- FIRST 3 ROWS -------


Unnamed: 0,Category,City,Country,Customer.ID,Customer.Name,Discount,Market,è®°å½æ°,Order.Date,Order.ID,Order.Priority,Product.ID,Product.Name,Profit,Quantity,Region,Row.ID,Sales,Segment,Ship.Date,Ship.Mode,Shipping.Cost,State,Sub.Category,Year,Market2,weeknum
0,Office Supplies,Los Angeles,United States,LS-172304,Lycoris Saunders,0.0,US,1,2011-01-07 00:00:00.000,CA-2011-130813,High,OFF-PA-10002005,Xerox 225,9.3312,3,West,36624,19,Consumer,2011-01-09 00:00:00.000,Second Class,4.37,California,Paper,2011,North America,2
1,Office Supplies,Los Angeles,United States,MV-174854,Mark Van Huff,0.0,US,1,2011-01-21 00:00:00.000,CA-2011-148614,Medium,OFF-PA-10002893,"Wirebound Service Call Books, 5 1/2"" x 4""",9.2928,2,West,37033,19,Consumer,2011-01-26 00:00:00.000,Standard Class,0.94,California,Paper,2011,North America,4
2,Office Supplies,Los Angeles,United States,CS-121304,Chad Sievert,0.0,US,1,2011-08-05 00:00:00.000,CA-2011-118962,Medium,OFF-PA-10000659,"Adams Phone Message Book, Professional, 400 Me...",9.8418,3,West,31468,21,Consumer,2011-08-09 00:00:00.000,Standard Class,1.81,California,Paper,2011,North America,32


In [14]:
# Try loading the Superstore data
try:
    # This might fail with default encoding
    df_store = pd.read_csv('archive/bank.csv', sep=';') 
    print("Bank Loaded Successfully!")
    inspect_data(df_store)
except Exception as e:
    print(f"Error: {e}")

Bank Loaded Successfully!
------- DATA SHAPE -------
Rows: 11162, Columns: 1

------- MISSING VALUES -------
No missing values found.

------- DUPLICATES -------
Duplicate Rows: 0

------- DATA TYPES -------
age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,deposit    object
dtype: object

------- FIRST 3 ROWS -------


Unnamed: 0,"age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,deposit"
0,"59,admin.,married,secondary,no,2343,yes,no,unk..."
1,"56,admin.,married,secondary,no,45,no,no,unknow..."
2,"41,technician,married,secondary,no,1270,yes,no..."
