In [None]:
import pandas as pd
import plotly.express as px
import re

In [None]:
# Load the raw Coursera course data; the file path is relative, so the CSV
# must be reachable from the notebook's working directory.
df = pd.read_csv("CourseraDataset.csv")

# DataSet Info


#### Column Descriptions
* **Course Title**: This column contains the title of the course offered on Coursera.
* **Rating**: The rating column likely contains the average rating of the course, as provided by users who have completed the course. Ratings are often given on a scale, such as 1 to 5 stars.
* **Level**: This column indicates the difficulty or complexity level of the course. It might categorize courses as beginner, intermediate, or advanced, for example.
* **Duration**: This column specifies the approximate time required to complete the course.
* **Schedule**: This column describes how the course is delivered, for example with a flexible schedule or as hands-on learning.
* **Review**: This column contains the count of reviews or ratings submitted by users who have completed the course. It provides an indication of the course's popularity and user satisfaction level.
* **What you will learn**: This column likely outlines the learning objectives or topics covered in the course. It provides a summary of the knowledge or skills that participants can expect to gain.
* **Skill gain**: This column may detail the specific skills that participants will acquire upon completion of the course.
* **Modules**: The modules column likely lists the different sections or units that make up the course. It could provide an overview of the course's structure and organization.
* **Instructor**: This column contains information about the instructor(s) or lecturer(s) who teach the course.
* **Offered By**: This column likely specifies the institution or organization offering the course on the Coursera platform.
* **Keyword**: This column may contain keywords or tags associated with the course, which can help users search for relevant courses based on specific topics or themes.
* **Course Url**: This column likely contains the URL or web link to the course page on the Coursera platform.


In [None]:
# Quick health check of the raw data: size, duplicated rows, missing values
duplicate_count = df.duplicated().sum()
has_missing_values = df.isna().values.any()

print(f"Shape: {df.shape}")
print(f"Duplicates: {duplicate_count}")
print(f"NaN Values: {has_missing_values}")

Shape: (9595, 13)
Duplicates: 900
NaN Values: True


In [None]:
# Display a single record (index label 800) to see what the raw values look like
df.loc[800]

Course Title                 Digital business - Act on the digital world
Rating                                                               4.6
Level                                                                NaN
Duration                                        10 hours (approximately)
Schedule                                               Flexible schedule
Review                                                        22 reviews
What you will learn                                                  NaN
Skill gain                                                            []
Modules                ['Big Picture', 'Operational Area', 'Focus Area']
Instructor                          ['Thomas Houy', 'Valérie Fernandez']
Offered By             ['École Polytechnique', 'Institut Mines-Télécom']
Keyword                                                         Business
Course Url             https://www.coursera.org/learn/act-on-digital-...
Name: 800, dtype: object

In [None]:
# Per-column overview of the raw data: dtype and non-null count
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9595 entries, 0 to 9594
Data columns (total 13 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Course Title         9595 non-null   object 
 1   Rating               8156 non-null   float64
 2   Level                8330 non-null   object 
 3   Duration             9333 non-null   object 
 4   Schedule             8912 non-null   object 
 5   Review               8152 non-null   object 
 6   What you will learn  4984 non-null   object 
 7   Skill gain           9595 non-null   object 
 8   Modules              9595 non-null   object 
 9   Instructor           9595 non-null   object 
 10  Offered By           9595 non-null   object 
 11  Keyword              9595 non-null   object 
 12  Course Url           9595 non-null   object 
dtypes: float64(1), object(12)
memory usage: 974.6+ KB


# Data Cleaning

### NaN Values Present in the DataFrame



In [None]:
# Percentage of missing values per column, as a two-column DataFrame
# ("Column Name", "NaN Values PCT") sorted in ascending order of percentage
row_count = df.shape[0]
nan_values_pct = (
    (df.isna().sum() / row_count * 100)
    .rename_axis("Column Name")
    .reset_index(name="NaN Values PCT")
    .sort_values(by="NaN Values PCT")
)

# Bar chart of the missing-value percentage for each column
fig = px.bar(
    nan_values_pct,
    x="Column Name",
    y="NaN Values PCT",
    color_discrete_sequence=["darkred"],
    title="NaN Values Present in the DataFrame",
)
fig.show()

### Removing and Filling Columns

In [None]:
# Drop the columns that are empty or not useful for this analysis
unused_columns = ["Schedule", "What you will learn", "Skill gain"]
df.drop(columns=unused_columns, inplace=True)

In [None]:
# Remove duplicated rows in place. No subset is given, so rows are compared
# on every column still present in df when this cell runs.
df.drop_duplicates(inplace=True)

In [None]:
# Default used to fill missing values in each column:
# "Rating" and "Review" get 0, "Level" gets the label "Undefined"
fill_defaults = {"Rating": 0, "Review": 0, "Level": "Undefined"}

for column_name, default_value in fill_defaults.items():
    df[column_name] = df[column_name].fillna(default_value)

In [None]:
# Strip the " level" suffix from "Level" using the vectorized string accessor
# (literal match, not a regular expression)
df["Level"] = df["Level"].str.replace(" level", "", regex=False)

# "Review" holds strings such as "2,813 reviews" plus the integer 0 used to
# fill missing values, so cast everything to str first, then remove the
# " reviews" suffix and the thousands separators before converting to int
df["Review"] = (
    df["Review"]
    .astype(str)
    .str.replace(" reviews", "", regex=False)
    .str.replace(",", "", regex=False)
    .astype("int")
)

In [None]:
# Drop, in place, every row whose "Duration" is missing; total_hours() runs
# re.match on each value and therefore needs a string
df.dropna(subset=["Duration"], inplace=True)

### Converting the “Duration” Column to Total Hours

In [None]:
# Function to convert time into total hours
def total_hours(time):
    """Convert a textual course duration into a total number of hours.

    The patterns below are tried in order, each anchored at the start of the
    string (``re.match``). Singular and plural unit names are both accepted.

    * ``"X months at Y hours a week"``      -> ``X * 4 * Y`` (a month counts as 4 weeks)
    * ``"Approx. X hours to complete"`` or
      ``"X hours (approximately"``          -> ``X``
    * ``"X hour and Y minutes"``            -> ``X + Y / 60``
    * ``"X week of study, Y hours"``        -> ``X * Y``
    * ``"X hours"`` (X may be a decimal)    -> ``X``

    Parameters
    ----------
    time : str
        Raw duration text.

    Returns
    -------
    float or None
        The duration in hours, or ``None`` when no pattern matches.

    Raises
    ------
    TypeError
        If ``time`` is not a string (raised by ``re.match``).
    """
    # Pattern for "X months at Y hours a week"
    match = re.match(r"(\d+) months? at (\d+) hours? a week", time)
    if match:
        months, hours_per_week = (float(value) for value in match.groups())
        return months * 4 * hours_per_week

    # Pattern for "Approx. X hours to complete" or "X hours (approximately)";
    # only one of the two capture groups is filled, depending on the alternative
    match = re.match(r"Approx\. (\d+) hours? to complete|(\d+) hours? \(approximately", time)
    if match:
        return float(match.group(1) or match.group(2))

    # Pattern for "X hour and Y minutes". This must be tried before the plain
    # "X hours" pattern, which also matches the leading "X hour" and would
    # otherwise discard the minutes.
    match = re.match(r"(\d+) hours? and (\d+) minutes?", time)
    if match:
        hours, minutes = (float(value) for value in match.groups())
        return hours + minutes / 60

    # Pattern for "X week of study, Y hours"
    match = re.match(r"(\d+) weeks? of study, (\d+(?:\.\d+)?) hours?", time)
    if match:
        weeks, hours_per_week = (float(value) for value in match.groups())
        return weeks * hours_per_week

    # Pattern for "X hours" (least specific, so it is checked last)
    match = re.match(r"(\d+(?:\.\d+)?) hours?", time)
    if match:
        return float(match.group(1))

    # Unrecognised format: the caller sees a missing value
    return None

In [None]:
# Convert every "Duration" string to a number of hours with total_hours()
df["Duration"] = df["Duration"].apply(total_hours)

# Drop the rows left without a duration (total_hours returned no value for them)
df.dropna(subset=["Duration"], inplace=True)

# Data Cleaning Result

In [None]:
# Same health check as on the raw data, now on the cleaned DataFrame
remaining_duplicates = df.duplicated().sum()
still_has_nan = df.isna().values.any()

print(f"Shape: {df.shape}")
print("Removed Columns: Schedule, What you will learn, Skill gain")
print(f"Duplicates: {remaining_duplicates}")
print(f"NaN Values: {still_has_nan}")

Shape: (8451, 10)
Removed Columns: Schedule, What you will learn, Skill gain
Duplicates: 0
NaN Values: False


In [None]:
# Preview the first rows of the cleaned DataFrame
df.head()

Unnamed: 0,Course Title,Rating,Level,Duration,Review,Modules,Instructor,Offered By,Keyword,Course Url
0,Fashion as Design,4.8,Beginner,20.0,2813,"['Introduction', 'Heroes', 'Silhouettes', 'Cou...","['Anna Burckhardt', 'Paola Antonelli', 'Michel...",['The Museum of Modern Art'],Arts and Humanities,https://www.coursera.org/learn/fashion-design
1,Modern American Poetry,4.4,Beginner,34.0,100,"['Orientation', 'Module 1', 'Module 2', 'Modul...",['Cary Nelson'],['University of Illinois at Urbana-Champaign'],Arts and Humanities,https://www.coursera.org/learn/modern-american...
2,Pixel Art for Video Games,4.5,Beginner,9.0,227,"['Week 1: Introduction to Pixel Art', 'Week 2:...","['Andrew Dennis', 'Ricardo Guimaraes']",['Michigan State University'],Arts and Humanities,https://www.coursera.org/learn/pixel-art-video...
3,Distribución digital de la música independiente,0.0,Beginner,8.0,0,"['Semana 1', 'Semana 2', 'Semana 3', 'Semana 4']",['Eduardo de la Vara Brown.'],['SAE Institute México'],Arts and Humanities,https://www.coursera.org/learn/distribucion-di...
4,The Blues: Understanding and Performing an Ame...,4.8,Beginner,11.0,582,"['Blues Progressions – Theory and Practice ', ...",['Dariusz Terefenko'],['University of Rochester'],Arts and Humanities,https://www.coursera.org/learn/the-blues


In [None]:
# Check the dtypes and non-null counts after cleaning
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8451 entries, 0 to 8694
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Course Title  8451 non-null   object 
 1   Rating        8451 non-null   float64
 2   Level         8451 non-null   object 
 3   Duration      8451 non-null   float64
 4   Review        8451 non-null   int64  
 5   Modules       8451 non-null   object 
 6   Instructor    8451 non-null   object 
 7   Offered By    8451 non-null   object 
 8   Keyword       8451 non-null   object 
 9   Course Url    8451 non-null   object 
dtypes: float64(2), int64(1), object(7)
memory usage: 726.3+ KB
