In [None]:
# Pandas Profiling | Day 22 | CampusX
# ----------------------------------

# 1. Introduction to Pandas Profiling
# - Pandas Profiling is a powerful Python library for automated exploratory data analysis (EDA).
# - Generates comprehensive profile reports on a dataset with a single line of code.
# - Helps you quickly understand dataset structure, quality, distributions, and relationships between features.

# 2. Key Features of a Pandas Profiling Report:
#   - Overview:
#     * Dataset dimensions (rows, columns)
#     * Data types of each column
#     * Memory usage
#     * Missing values count and percentage
#     * Duplicate rows percentage
#   - Variables:
#     * Statistical summary per feature (mean, median, mode, min, max, std deviation, quantiles)
#     * Histograms and distribution plots
#     * Number of distinct values
#     * Missing value visualizations
#   - Correlations:
#     * Pearson, Spearman, Kendall Tau, phi_k, and Cramér's V correlations
#     * Helps find linear and categorical feature associations
#   - Missing Values:
#     * Visual representation such as missing value matrices and heatmaps
#   - Sample:
#     * Displays samples of rows from the dataframe (head and tail)
#   - Warnings:
#     * Alerts on potential data issues like high cardinality, skewed distributions, or constant features

# 3. Installation:
# - Use pip: `pip install pandas-profiling`
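# - Note: pandas-profiling has been renamed to ydata-profiling, which is the actively maintained
#   package (a rename, not a separate fork). For new installs use:
#   `pip install -U ydata-profiling`
#   and import with `from ydata_profiling import ProfileReport` instead of `pandas_profiling`.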

# 4. Usage Example:
#   ```
#   import pandas as pd
#   from pandas_profiling import ProfileReport
#   df = pd.read_csv('your_dataset.csv')
#   profile = ProfileReport(df, title="Pandas Profiling Report")
#   profile.to_file("your_report.html")
#   ```
# - Open 'your_report.html' in a web browser to explore the interactive report.

# 5. Practical Benefits:
# - Saves time by automating routine EDA tasks.
# - Quickly identifies data quality issues and informs cleaning steps.
# - Provides visual and statistical information necessary for feature engineering.
# - Detects correlations that help in understanding feature interactions.
# - Helps beginners and experts alike to get a comprehensive dataset overview efficiently.

# 6. Tips for Effective Use:
# - For very large datasets, consider using sample subsets to generate the profile faster.
# - Use report parameters to customize output, such as minimal=True for a lighter report.
# - Always interpret warnings and insights to guide your data preprocessing steps effectively.

# -- End of Notes --


In [1]:
import pandas as pd

In [2]:
# Load the dataset into a DataFrame; 'train.csv' is a relative path, so it is
# resolved against the notebook's current working directory.
df = pd.read_csv('train.csv')

In [3]:
# Preview the first rows (five by default) to sanity-check that the load worked.
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
#installing the library 
!pip install sweetviz



In [5]:
# 2) Load dataset, quick checks, and optional sampling for very large files
import pandas as pd
from IPython.display import IFrame, display, HTML

# Tunables: input file and the row count above which profiling works on a sample.
DATA_PATH = "train.csv"   # change if different
MAX_ROWS_FOR_FULL_REPORT = 200_000   # change if you want

# Read the data and show its dimensions plus the first few rows.
df = pd.read_csv(DATA_PATH)
print("Rows, cols:", df.shape)
display(df.head())

# By default profile the full frame; only oversized datasets are replaced by a
# fixed-seed random sample so reports stay quick and memory-friendly.
df_sample = df
if len(df) > MAX_ROWS_FOR_FULL_REPORT:
    print(f">> Large dataset ({len(df)} rows). Sampling {MAX_ROWS_FOR_FULL_REPORT} rows for profiling.")
    df_sample = df.sample(n=MAX_ROWS_FOR_FULL_REPORT, random_state=42)


Rows, cols: (891, 12)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [6]:
import warnings

import numpy as np

# Compatibility shim: NumPy 2.0 removed the top-level alias
# `np.VisibleDeprecationWarning` (the class now lives in `numpy.exceptions`),
# but Sweetviz still looks it up on the `numpy` module. Restore the alias to
# the real warning class. The shim is applied BEFORE importing sweetviz so it
# is in place no matter when Sweetviz reads the attribute.
if not hasattr(np, 'VisibleDeprecationWarning'):
    from numpy.exceptions import VisibleDeprecationWarning
    np.VisibleDeprecationWarning = VisibleDeprecationWarning

import sweetviz as sv

# Output location of the generated HTML report (reused when embedding it).
REPORT_FILE = "sweetviz_report.html"

# Silence UserWarnings only while Sweetviz runs; a global filter would keep
# hiding unrelated warnings in every later cell of the notebook.
with warnings.catch_warnings():
    warnings.simplefilter('ignore', category=UserWarning)

    # Profile `df_sample` (the full frame, or the down-sampled one for very
    # large inputs) so the sampling done at load time actually takes effect.
    report = sv.analyze(df_sample)

    # Save the HTML report to disk without opening a browser tab.
    report.show_html(REPORT_FILE, open_browser=False)

print(f"Saved report to {REPORT_FILE}")

                                             |                                             | [  0%]   00:00 ->…

Report sweetviz_report.html was generated.
Saved report to sweetviz_report.html


In [7]:
# 4) Embed the saved HTML report inside the notebook (iframe)
# A heading followed by the report itself; tweak `height` if the report is cut off.
display(
    HTML('<h3>Sweetviz report</h3>'),
    IFrame(src=REPORT_FILE, width="100%", height=900),
)


In [8]:
import pandas as pd
import sweetviz as sv
from IPython.display import IFrame, display

# Build two DIFFERENT frames to compare. Reading 'train.csv' twice would
# compare the data with itself and every feature would look identical, so a
# reproducible 80/20 hold-out split is used instead.
# Replace this with your real train/test loading if you have separate files.
df_full = pd.read_csv('train.csv')
df_train = df_full.sample(frac=0.8, random_state=42)
df_test = df_full.drop(df_train.index)

# Target feature highlighted in the comparison; it is only passed to Sweetviz
# when the column actually exists, otherwise no target is used.
TARGET_COLUMN = 'Survived'  # Replace with your actual target column name
target_feat = TARGET_COLUMN if TARGET_COLUMN in df_train.columns else None

# Generate the side-by-side report, save it, and embed it in the notebook.
report_compare = sv.compare([df_train, "Train"], [df_test, "Test"], target_feat=target_feat)
report_compare.show_html("sweetviz_compare.html", open_browser=False)
display(IFrame("sweetviz_compare.html", width="100%", height=900))

                                             |                                             | [  0%]   00:00 ->…

Report sweetviz_compare.html was generated.


In [9]:
import pandas as pd
import sweetviz as sv
from IPython.display import IFrame, display

# Build two DIFFERENT frames to compare. Reading 'train.csv' twice would
# compare the data with itself and every feature would look identical, so a
# reproducible 80/20 hold-out split is used instead.
# Replace this with your real train/test loading if you have separate files.
df_full = pd.read_csv('train.csv')
df_train = df_full.sample(frac=0.8, random_state=42)
df_test = df_full.drop(df_train.index)

# Target feature highlighted in the comparison; it is only passed to Sweetviz
# when the column actually exists, otherwise no target is used.
TARGET_COLUMN = 'Fare'  # Replace with your actual target column name
target_feat = TARGET_COLUMN if TARGET_COLUMN in df_train.columns else None

# Use a target-specific file name so this report does not overwrite
# "sweetviz_compare.html", which the 'Survived' comparison writes.
COMPARE_REPORT_FILE = "sweetviz_compare_fare.html"

# Generate the side-by-side report, save it, and embed it in the notebook.
report_compare = sv.compare([df_train, "Train"], [df_test, "Test"], target_feat=target_feat)
report_compare.show_html(COMPARE_REPORT_FILE, open_browser=False)
display(IFrame(COMPARE_REPORT_FILE, width="100%", height=900))

                                             |                                             | [  0%]   00:00 ->…

Report sweetviz_compare.html was generated.


In [10]:
# Install D-Tale into the running kernel's environment (a kernel restart may be
# needed afterwards, as pip's note in the cell output says).
%pip install dtale

Note: you may need to restart the kernel to use updated packages.


In [11]:
import pandas as pd
import dtale

# Reload the dataset and start a D-Tale session for it.
df = pd.read_csv("train.csv")
d = dtale.show(df)

# Opens interactive UI in browser
d.open_browser()





This means that static image generation (e.g. `fig.write_image()`) will not work.

Please upgrade Plotly to version 6.1.1 or greater, or downgrade Kaleido to version 0.2.1.




In [12]:
# Install AutoViz into the running kernel's environment (a kernel restart may
# be needed afterwards, as pip's note in the cell output says).
%pip install autoviz

Note: you may need to restart the kernel to use updated packages.


In [13]:
!pip install nltk
import nltk
nltk.download('words')
nltk.download('omw-1.4')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')



[nltk_data] Downloading package words to C:\Users\milind
[nltk_data]     chavan\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package omw-1.4 to C:\Users\milind
[nltk_data]     chavan\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\milind
[nltk_data]     chavan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\milind
[nltk_data]     chavan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\milind
[nltk_data]     chavan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\milind chavan\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is al

True

In [14]:
from autoviz.AutoViz_Class import AutoViz_Class

# Create the AutoViz driver, then point it at the CSV path directly; all other
# options are left at their defaults. The return value is rebound to `df`.
AV = AutoViz_Class()
df = AV.AutoViz(filename="train.csv")

Imported v0.1.905. Please call AutoViz in this sequence:
    AV = AutoViz_Class()
    %matplotlib inline
    dfte = AV.AutoViz(filename, sep=',', depVar='', dfte=None, header=0, verbose=1, lowess=False,
               chart_format='svg',max_rows_analyzed=150000,max_cols_analyzed=30, save_plot_dir=None)
Shape of your Data Set loaded: (891, 12)
#######################################################################################
######################## C L A S S I F Y I N G  V A R I A B L E S  ####################
#######################################################################################
Classifying variables in data set...
    Number of Numeric Columns =  2
    Number of Integer-Categorical Columns =  3
    Number of String-Categorical Columns =  1
    Number of Factor-Categorical Columns =  0
    Number of String-Boolean Columns =  1
    Number of Numeric-Boolean Columns =  1
    Number of Discrete String Columns =  2
    Number of NLP String Columns =  1
    Number of 

Unnamed: 0,Data Type,Missing Values%,Unique Values%,Minimum Value,Maximum Value,DQ Issue
PassengerId,int64,0.0,100.0,1.0,891.0,Possible ID column: drop before modeling step.
Survived,int64,0.0,0.0,0.0,1.0,No issue
Pclass,int64,0.0,0.0,1.0,3.0,No issue
Name,object,0.0,100.0,,,No issue
Sex,object,0.0,0.0,,,No issue
Age,float64,19.86532,,0.42,80.0,"177 missing values. Impute them with mean, median, mode, or a constant value such as 123., Column has 11 outliers greater than upper bound (64.81) or lower than lower bound(-6.69). Cap them or remove them."
SibSp,int64,0.0,0.0,0.0,8.0,Column has 46 outliers greater than upper bound (2.50) or lower than lower bound(-1.50). Cap them or remove them.
Parch,int64,0.0,0.0,0.0,6.0,Column has 213 outliers greater than upper bound (0.00) or lower than lower bound(0.00). Cap them or remove them.
Ticket,object,0.0,76.0,,,Possible high cardinality column with 681 unique values: Use hash encoding or text embedding to reduce dimension.
Fare,float64,0.0,,0.0,512.3292,Column has 116 outliers greater than upper bound (65.63) or lower than lower bound(-26.72). Cap them or remove them.


Number of All Scatter Plots = 3


[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to C:\Users\milind
[nltk_data]    |     chavan\AppData\Roaming\nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package gazetteers to C:\Users\milind
[nltk_data]    |     chavan\AppData\Roaming\nltk_data...
[nltk_data]    |   Package gazetteers is already up-to-date!
[nltk_data]    | Downloading package genesis to C:\Users\milind
[nltk_data]    |     chavan\AppData\Roaming\nltk_data...
[nltk_data]    |   Package genesis is already up-to-date!
[nltk_data]    | Downloading package gutenberg to C:\Users\milind
[nltk_data]    |     chavan\AppData\Roaming\nltk_data...
[nltk_data]    |   Package gutenberg is already up-to-date!
[nltk_data]    | Downloading package inaugural to C:\Users\milind
[nltk_data]    |     chavan\AppData\Roaming\nltk_data...
[nltk_data]    |   Package inaugural is already up-to-date!
[nltk_data]    | Downloading

All Plots done
Time to run AutoViz = 28 seconds 

 ###################### AUTO VISUALIZATION Completed ########################


In [15]:
!pip install lux-api



In [17]:
# Install Lux together with its notebook widget front-end; --quiet trims pip's
# output. A kernel restart may be needed before the packages can be imported.
%pip install lux-api lux-widget --quiet

Note: you may need to restart the kernel to use updated packages.


In [21]:
import pandas as pd
import lux
from IPython.display import display

# Record the installed Lux version alongside the output for reproducibility.
print("lux version:", lux.__version__)

df = pd.read_csv("train.csv")

# Show Lux recommendations by default. Lux only accepts "lux" or "pandas" for
# this option; any other value (such as "widget") is rejected and the plain
# pandas table is rendered instead.
lux.config.default_display = "lux"

# show the DataFrame (this should render the widget with recommendations)
display(df)

lux version: 0.5.1


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [None]:
# Install DataPrep into the running kernel's environment (a kernel restart may
# be needed before it can be imported).
%pip install dataprep

In [None]:
import pandas as pd
from dataprep.eda import create_report

# Reload the dataset, build DataPrep's EDA report for it, and open the
# result in the browser.
df = pd.read_csv("train.csv")
create_report(df).show_browser()