# SQL Project
You were hired by Ironhack to perform an Analytics Consulting Project entitled: competitive landscape.

Your mission is to create and populate an appropriate database with many coding schools that are our competition, as well as to design suitable queries that answer business questions of interest (to be defined by you).


In [None]:
# !pip install mysql-connector-python

# Libraries & generic functions

In [1]:
#Libraries

import re

import pandas as pd
from pandas import json_normalize

pd.set_option('display.max_columns', None)
# pd.set_option('display.max_row', None)
# pd.reset_option("display.max_rows")
# pd.reset_option("display.max_columns")

import requests

import getpass
import mysql.connector

from datetime import datetime

In [2]:
#Function to remove the HTML tags after scraping

def remove_tags(x):
    """Return the string *x* with every ``<...>`` tag removed.

    A tag is a ``<``, followed by one or more characters that are not ``>``,
    followed by ``>``. The text between tags is left untouched.
    """
    tag_pattern = r'<[^>]+>'
    return re.sub(tag_pattern, '', x)

In [3]:
#Function to format MM/DD/YYYY into a MySQL-readable date: YYYY-MM-DD

def change_day_month_year(date):
    """Convert a ``MM/DD/YYYY`` date string into ``YYYY-MM-DD``.

    The input is parsed with the ``%m/%d/%Y`` format, so a string that does
    not match it makes ``datetime.strptime`` raise ``ValueError``.
    """
    return datetime.strptime(date, "%m/%d/%Y").strftime("%Y-%m-%d")

In [4]:
#Programs for Ironhack weren't well formatted. This function is used to standardize them

# Legacy Ironhack program names -> the standard name used in the analysis.
# Keys are stored without surrounding whitespace; the lookup strips the input.
_PROGRAM_ALIASES = {
    'Full-time Web Development Bootcamp': 'Web Development Bootcamp',
    'Full-time UX/UI Design Bootcamp': 'UX/UI Design Bootcamp',
    'Part-time UX/UI Design': 'UX/UI Design Part-Time',
    'Part-time Web Development': 'Web Development Part-Time',
}


def program_cleaner(program):
    """Return the standardized name of an Ironhack program.

    Parameters
    ----------
    program : str or None
        Program name as found in the scraped reviews.

    Returns
    -------
    The standard name when *program* (ignoring leading/trailing whitespace)
    is one of the known legacy names; otherwise *program* unchanged.
    Non-string values (e.g. ``None`` for a missing program) are returned as-is.
    """
    if not isinstance(program, str):
        return program

    # Strip only for the lookup: every legacy name is matched with or without
    # stray whitespace, while unknown names are returned exactly as received.
    return _PROGRAM_ALIASES.get(program.strip(), program)

URLs to inspect the data being scraped

https://www.switchup.org/chimera/v1/school-review-list?mainTemplate=school-review-list&path=%2Fbootcamps%2Fironhack&isDataTarget=false&page=3&perPage=10000&simpleHtml=true&truncationLength=250

https://www.switchup.org/chimera/v1/bootcamp-data?mainTemplate=bootcamp-data%2Fdescription&path=%2Fbootcamps%2Fironhack&isDataTarget=false&bootcampId=10828&logoTag=logo&truncationLength=250&readMoreOmission=...&readMoreText=Read%20More&readLessText=Read%20Less

In [5]:
#Getting the comments of a school

def get_comments_school(school,id):
    """Download the SwitchUp reviews of one school into a DataFrame.

    Parameters
    ----------
    school : str
        School slug inserted in the ``path`` part of the request URL
        (e.g. ``'ironhack'``).
    id : int
        Identifier stored in the ``school_id`` column of the result. It is
        not part of the request. (The name shadows the builtin ``id`` inside
        this function; it is kept so existing keyword callers still work.)

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``content.reviews`` in the JSON answer, plus the
        columns ``review_body`` (``body`` without HTML tags), ``school`` and
        ``school_id``.

    Raises
    ------
    requests.exceptions.Timeout
        If the server does not answer within the timeout.
    requests.exceptions.HTTPError
        If the server answers with a 4xx/5xx status.
    """

    #Url to get data
    url = "https://www.switchup.org/chimera/v1/school-review-list?mainTemplate=school-review-list&path=%2Fbootcamps%2F" + school + "&isDataTarget=false&page=3&perPage=10000&simpleHtml=true&truncationLength=250"

    #Without a timeout a stalled connection would block the notebook forever
    response = requests.get(url, timeout=60)
    #Fail with the real HTTP error instead of an obscure JSON decoding error
    response.raise_for_status()

    #Data into a dataframe
    data = response.json()
    reviews = pd.DataFrame(data['content']['reviews'])

    #Apply the regex helper to remove the HTML tags of the review text
    reviews['review_body'] = reviews['body'].apply(remove_tags)

    #Adding useful features to the dataframe
    reviews['school'] = school
    reviews['school_id'] = id
    return reviews

# Comments dataframe

In [6]:
#Schools dictionary 
#Manually collected after reading the HTML: the schools with the most comments + the ones we were interested in
#Another way would have been to scrape the entire website to get each school name and school id. But we will see later that these schools already give us a lot of data to analyze

# School slug (as used in the SwitchUp URLs) -> SwitchUp school id.
# Each key appears exactly once: a repeated key in a dict literal is silently
# overwritten, which hides copy/paste mistakes.
# Commented-out entries are schools that were considered but are not scraped.
schools = {
    'ironhack': 10828,
    'app-academy': 10525,
    #'springboard' : 11035,
    'le-wagon': 10868,
    #'udacity' : 11118,
    'shecodes': 11014,
    'designlab': 10697,
    'nucamp': 10923,
    'thinkful': 11098,
    #'software-development-academy' : 11030,
    'coding-dojo': 10659,
    'makers-academy': 10874,
    'product-gym': 10959,
}

In [7]:
#Scrapping all the school we want to look at 

# One reviews DataFrame per school, in the order of the `schools` dictionary
comments = [
    get_comments_school(school_name, school_id)
    for school_name, school_id in schools.items()
]

In [8]:
#Putting the data in a dataframe

# Stack the per-school frames and renumber the rows 0..n-1; the per-school
# row numbers are discarded instead of being kept as an 'index' column
comments = pd.concat(comments).reset_index(drop=True)

comments.head(2)

Unnamed: 0,id,name,anonymous,hostProgramName,graduatingYear,isAlumni,jobTitle,tagline,body,rawBody,createdAt,queryDate,program,user,overallScore,comments,overall,curriculum,jobSupport,review_body,school,school_id
0,306372,Sergio Burgos,False,,2023.0,False,International Negotiator,The Most Intense Academic Challenge,"<span class=""truncatable""><p></p><p>After comp...",<p>After completing my Data Analytics Bootcamp...,11/10/2023,2023-11-10,Data Analytics Bootcamp,{'image': None},3.3,[],3.0,3.0,4.0,After completing my Data Analytics Bootcamp wi...,ironhack,10828
1,306215,Anonymous,True,,2023.0,True,,Transformative Experience: My Time at Ironhack,"<span class=""truncatable""><p></p><p>Pros: 1)In...",<p>Pros: 1)Intensive Learning 2)Real-World Pro...,11/6/2023,2023-11-06,Web Development Bootcamp,{'image': None},4.0,[],4.0,4.0,4.0,Pros: 1)Intensive Learning 2)Real-World Projec...,ironhack,10828


In [9]:
#Having a look at how our data are

comments.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10357 entries, 0 to 10356
Data columns (total 22 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   id               10357 non-null  int64  
 1   name             10357 non-null  object 
 2   anonymous        10357 non-null  bool   
 3   hostProgramName  4175 non-null   object 
 4   graduatingYear   10305 non-null  float64
 5   isAlumni         10343 non-null  object 
 6   jobTitle         7212 non-null   object 
 7   tagline          10355 non-null  object 
 8   body             10357 non-null  object 
 9   rawBody          10357 non-null  object 
 10  createdAt        10357 non-null  object 
 11  queryDate        10357 non-null  object 
 12  program          9494 non-null   object 
 13  user             10357 non-null  object 
 14  overallScore     10335 non-null  object 
 15  comments         10357 non-null  object 
 16  overall          10333 non-null  object 
 17  curriculum  

## Cleaning the dataframe and changing the column types to suit the SQL database and the analysis

In [9]:
def _float_or_zero(value):
    # Missing ratings become 0, every other value is cast to float
    return 0 if pd.isna(value) else float(value)


# The raw HTML bodies and the nested 'user' / 'comments' payloads are not used
comments = comments.drop(columns=['body', 'rawBody', 'user', 'comments'])

# Reviews without a graduation year are discarded
comments = comments.dropna(subset=['graduatingYear']).copy()

comments['graduatingYear'] = comments['graduatingYear'].apply(int)
comments['isAlumni'] = comments['isAlumni'].apply(bool)

# MM/DD/YYYY -> YYYY-MM-DD
comments['createdAt'] = comments['createdAt'].apply(change_day_month_year)

# NOTE(review): a missing rating is stored as 0, so it is counted as a real
# zero by every mean computed on these columns - confirm this is intended.
for rating_column in ('overallScore', 'overall', 'curriculum', 'jobSupport'):
    comments[rating_column] = comments[rating_column].apply(_float_or_zero)

# Only graduates from 2015 onwards are kept
graduated_before_2015 = comments['graduatingYear'] < 2015
comments.drop(comments[graduated_before_2015].index, inplace=True)

In [None]:
# Check the column types and non-null counts after the cleaning step
comments.info()

In [None]:
# Preview the first three cleaned rows
comments.head(3)

## Creating a comments dataframe filtered on Ironhack and with extra cleaning

In [32]:
# Independent copy of the Ironhack reviews, so the extra cleaning does not touch `comments`
is_ironhack = comments['school'] == 'ironhack'
comments_ironhack = comments[is_ironhack].copy()

# Distinct program names as scraped, to see which ones need standardizing
programs = list(comments_ironhack['program'].unique())
programs

['Data Analytics Bootcamp',
 'Web Development Bootcamp',
 None,
 'UX/UI Design Bootcamp',
 'Web Development Part-Time',
 'Data Analytics Part-Time',
 'UX/UI Design Part-Time',
 'Cybersecurity Part-Time',
 'Cyber Security Bootcamp',
 '',
 'Full-time Web Development Bootcamp',
 'Full-time UX/UI Design Bootcamp',
 'Part-time Web Development',
 'Part-time UX/UI Design',
 'Full-time Web Development Bootcamp ',
 'Part-time UX/UI Design ',
 'Web Design']

In [33]:
# Standardize the legacy program names
comments_ironhack['program'] = comments_ironhack['program'].apply(program_cleaner)

# Remove the reviews whose program is 'Web Design' or an empty string
unwanted_program = comments_ironhack['program'].isin(['Web Design', ''])
comments_ironhack.drop(comments_ironhack[unwanted_program].index, inplace=True)

comments_ironhack.head(3)

Unnamed: 0,id,name,anonymous,hostProgramName,graduatingYear,isAlumni,jobTitle,tagline,createdAt,queryDate,program,overallScore,overall,curriculum,jobSupport,review_body,school,school_id
0,306372,Sergio Burgos,False,,2023,False,International Negotiator,The Most Intense Academic Challenge,2023-11-10,2023-11-10,Data Analytics Bootcamp,3.3,3.0,3.0,4.0,After completing my Data Analytics Bootcamp wi...,ironhack,10828
1,306215,Anonymous,True,,2023,True,,Transformative Experience: My Time at Ironhack,2023-11-06,2023-11-06,Web Development Bootcamp,4.0,4.0,4.0,4.0,Pros: 1)Intensive Learning 2)Real-World Projec...,ironhack,10828
2,306068,Anonymous,True,,2023,False,Full stack development,Now I can do it,2023-10-31,2023-10-31,,5.0,5.0,5.0,5.0,"7 months ago, I only had an idea about html an...",ironhack,10828
3,305297,Utku Cikmaz,False,,2023,False,Full Stack Web Developer,It was good,2023-10-02,2023-10-02,Web Development Bootcamp,4.0,5.0,3.0,4.0,"The course was great. Especially, Luis is a gr...",ironhack,10828
4,305278,Nirmal Hodge,False,,2023,False,Product Designer,Ironhack 100% Worth It!,2023-09-30,2023-09-30,UX/UI Design Bootcamp,5.0,5.0,5.0,5.0,I joined the UX/ UI Bootcamp and to be honest ...,ironhack,10828


In [34]:
# Number of reviews per standardized program
comments_ironhack['program'].value_counts()

program
Web Development Bootcamp     608
UX/UI Design Bootcamp        270
Web Development Part-Time    143
Data Analytics Bootcamp       58
UX/UI Design Part-Time        37
Data Analytics Part-Time      23
Cyber Security Bootcamp        6
Cybersecurity Part-Time        3
Name: count, dtype: int64

## Grouping on school and year to visualize the mean score

In [27]:
# Mean overall score per (school, graduation year), shown as a flat table
comments.groupby(['school', 'graduatingYear'])['overallScore'].mean().reset_index()

Unnamed: 0,school,graduatingYear,overallScore
0,app-academy,2015,4.276471
1,app-academy,2016,4.812903
2,app-academy,2017,4.631897
3,app-academy,2018,4.671921
4,app-academy,2019,4.531982
...,...,...,...
78,thinkful,2018,4.608621
79,thinkful,2019,4.347059
80,thinkful,2020,3.760000
81,thinkful,2021,3.814286


In [28]:
# Mean overall score per school, shown as a flat table
comments.groupby(['school'])['overallScore'].mean().reset_index()

Unnamed: 0,school,overallScore
0,app-academy,4.58887
1,coding-dojo,4.425277
2,designlab,4.577189
3,ironhack,4.717412
4,le-wagon,4.915251
5,makers-academy,4.614655
6,nucamp,4.47274
7,product-gym,4.9394
8,shecodes,4.884577
9,thinkful,4.541627


## Filtering on Ironhack and grouping by graduation year, then by program, to visualize all the scores

In [40]:
# Mean of each Ironhack rating per graduation year
rating_columns = ['overallScore', 'overall', 'curriculum', 'jobSupport']
ratings_ironhack = comments_ironhack.groupby('graduatingYear')[rating_columns].mean().reset_index()
ratings_ironhack

Unnamed: 0,graduatingYear,overallScore,overall,curriculum,jobSupport
0,2015,4.24375,4.3125,2.65625,2.75
1,2016,4.854286,4.871429,3.7,3.7
2,2017,4.867895,4.910526,4.726316,4.742105
3,2018,4.869935,4.905229,4.787582,4.79085
4,2019,4.765285,4.80829,4.694301,4.772021
5,2020,4.730061,4.797546,4.760736,4.564417
6,2021,4.526744,4.639535,4.569767,4.360465
7,2022,4.395455,4.509091,4.463636,4.163636
8,2023,4.159615,4.25,4.211538,3.903846


In [39]:
# Mean of each Ironhack rating per program, best overall score first.
# This rebinds `ratings_ironhack`, replacing whatever it held before.
rating_columns = ['overallScore', 'overall', 'curriculum', 'jobSupport']
ratings_ironhack = comments_ironhack.groupby('program')[rating_columns].mean().reset_index()
ratings_ironhack.sort_values('overallScore', ascending=False)

Unnamed: 0,program,overallScore,overall,curriculum,jobSupport
6,Web Development Bootcamp,4.821546,4.865132,4.773026,4.731908
4,UX/UI Design Bootcamp,4.672593,4.744444,4.607407,4.503704
5,UX/UI Design Part-Time,4.67027,4.702703,4.648649,4.513514
7,Web Development Part-Time,4.605594,4.671329,3.72028,3.699301
2,Data Analytics Bootcamp,4.443103,4.534483,4.431034,4.362069
3,Data Analytics Part-Time,4.308696,4.434783,4.347826,3.956522
0,Cyber Security Bootcamp,4.283333,4.166667,4.333333,4.333333
1,Cybersecurity Part-Time,3.866667,4.0,3.666667,4.0


# Other dataframes

In [12]:
def get_school_info(school, school_id):
    """Download the SwitchUp description data of one school.

    Parameters
    ----------
    school : str
        School slug inserted in the ``path`` part of the request URL.
    school_id : int
        Identifier sent as ``bootcampId`` in the request URL.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(locations_df, courses_df, badges_df, school_df)`` built from the
        ``locations``, ``courses``, ``meritBadges`` and general fields of
        ``content`` in the JSON answer. Every frame also carries the
        ``school`` and ``school_id`` columns.

    Raises
    ------
    requests.exceptions.Timeout
        If the server does not answer within the timeout.
    requests.exceptions.HTTPError
        If the server answers with a 4xx/5xx status.
    """

    #Getting data from the url
    url = 'https://www.switchup.org/chimera/v1/bootcamp-data?mainTemplate=bootcamp-data%2Fdescription&path=%2Fbootcamps%2F'+ str(school) + '&isDataTarget=false&bootcampId='+ str(school_id) + '&logoTag=logo&truncationLength=250&readMoreOmission=...&readMoreText=Read%20More&readLessText=Read%20Less'

    #Without a timeout a stalled connection would block the notebook forever
    response = requests.get(url, timeout=60)
    #Fail with the real HTTP error instead of an obscure JSON decoding error
    response.raise_for_status()
    content = response.json()['content']

    #Getting the courses a school provides. Ex: data, web dev ...
    courses_df = pd.DataFrame(content['courses'], columns= ['courses'])

    #Getting information about where the school gives classes
    #(nested fields are flattened into dotted column names)
    locations_df = json_normalize(content['locations'])

    #Getting the merit badges of the school
    badges_df = pd.DataFrame(content['meritBadges'])

    #Getting information about the school itself.
    #The values are laid out as one column and transposed into a single row.
    website = content['webaddr']
    description = content['description']
    logoUrl = content['logoUrl']
    price_min = content['priceMin']
    price_max = content['priceMax']
    school_df = pd.DataFrame([website,description,logoUrl, price_min, price_max]).T
    school_df.columns =  ['website','description','LogoUrl', 'price_min', 'price_max']

    #Adding the name and the id of the school as features of every frame
    for frame in (locations_df, courses_df, badges_df, school_df):
        frame['school'] = school
        frame['school_id'] = school_id

    return locations_df, courses_df, badges_df, school_df

In [13]:
#Creation of the dataframes:
#The 4 lists below are lists of dataframes. Each element of a list is the dataframe of one school

locations_list = []
courses_list = []
badges_list = []
schools_list = []

# `school_id` is used instead of `id` so the builtin `id` is not rebound at
# notebook scope for every later cell.
for school_name, school_id in schools.items():
    # Progress trace: one line per school being downloaded
    print(school_name)
    locations_df, courses_df, badges_df, school_df = get_school_info(school_name, school_id)

    locations_list.append(locations_df)
    courses_list.append(courses_df)
    badges_list.append(badges_df)
    schools_list.append(school_df)

ironhack


app-academy
le-wagon
shecodes
designlab
nucamp
thinkful
coding-dojo
makers-academy
product-gym


## Finalization of creation and cleaning locations

In [14]:
# Columns of the flattened location records that are not loaded into SQL
unused_location_columns = [
    'state.id', 'state.name', 'state.abbrev', 'state.keyword',
    'description', 'country.abbrev', 'city.keyword',
]

locations = (
    pd.concat(locations_list)
    .reset_index(drop=True)
    .drop(columns=unused_location_columns)
    # Keep only the locations whose country and city are both filled in
    .dropna(subset=['country.id', 'country.name', 'city.id', 'city.name'])
    # Dotted names are replaced by plain identifiers usable as SQL column names
    .rename(columns={
        'country.id': 'country_id',
        'country.name': 'country_name',
        'city.id': 'city_id',
        'city.name': 'city_name',
    })
)
locations.head()

Unnamed: 0,id,country_id,country_name,city_id,city_name,school,school_id
0,15901,57.0,Germany,31156.0,Berlin,ironhack,10828
1,16022,29.0,Mexico,31175.0,Mexico City,ironhack,10828
2,16086,59.0,Netherlands,31168.0,Amsterdam,ironhack,10828
3,16088,42.0,Brazil,31121.0,Sao Paulo,ironhack,10828
4,16109,38.0,France,31136.0,Paris,ironhack,10828


## Finalization of creation and cleaning courses

In [15]:
# Stack the per-school course frames and renumber the rows 0..n-1
courses = pd.concat(courses_list).reset_index(drop=True)
courses.head()

Unnamed: 0,courses,school,school_id
0,Cyber Security Bootcamp,ironhack,10828
1,Cybersecurity Part-Time,ironhack,10828
2,Data Analytics Bootcamp,ironhack,10828
3,Data Analytics Part-Time,ironhack,10828
4,UX/UI Design Bootcamp,ironhack,10828


## Finalization of creation and cleaning badges

In [16]:
# Stack the per-school badge frames and renumber the rows 0..n-1
badges = pd.concat(badges_list).reset_index(drop=True)

# The badge descriptions contain HTML tags: keep the text only
badges['description'] = badges['description'].map(remove_tags)
badges.head()

Unnamed: 0,name,keyword,description,school,school_id
0,Available Online,available_online,School offers fully online courses,ironhack,10828
1,Verified Outcomes,verified_outcomes,School publishes a third-party verified outcom...,ironhack,10828
2,Flexible Classes,flexible_classes,School offers part-time and evening classes,ironhack,10828
3,Available Online,available_online,School offers fully online courses,app-academy,10525
4,Flexible Classes,flexible_classes,School offers part-time and evening classes,app-academy,10525


## Finalization of creation and cleaning schools

In [17]:
# NOTE: from this cell on, `schools` is the DataFrame of school details and no
# longer the slug -> id dictionary it was bound to before.
schools = (
    pd.concat(schools_list)
    .reset_index(drop=True)
    .drop(columns=['LogoUrl'])
)

# The school descriptions contain HTML tags: keep the text only
schools['description'] = schools['description'].map(remove_tags)
schools.head()

Unnamed: 0,website,description,price_min,price_max,school,school_id
0,www.ironhack.com/en,Ironhack is a global tech school with 9 campus...,7500.0,13000.0,ironhack,10828
1,appacademy.io,"Founded in 2012, App Academy is a world-renown...",0.0,22000.0,app-academy,10525
2,www.lewagon.com,Le Wagon is a global leader in immersive tech ...,,,le-wagon,10868
3,shecodes.io,SheCodes is a coding school that offers online...,99.0,1990.0,shecodes,11014
4,designlab.com,Designlab teaches in-demand UX/UI design skill...,399.0,7749.0,designlab,10697


# Importing DataFrames into SQL

In [18]:
# Ask for the MySQL password interactively so it is not written in the notebook
saved_password = getpass.getpass()

In [23]:
#Setting up the connection

connection = mysql.connector.connect(
    user='root',
    password=saved_password,
    database='SQL_project',
    port=3306,
)
print(connection.is_connected())
cursor = connection.cursor(buffered=True)

#Comma-separated column names of each dataframe, used in the INSERT statements

schools_cols = ",".join(map(str, schools.columns))
locations_cols = ",".join(map(str, locations.columns))
courses_cols = ",".join(map(str, courses.columns))
badges_cols = ",".join(map(str, badges.columns))
comments_cols = ",".join(map(str, comments.columns))

True


In [24]:
#Creating and executing the queries

def _insert_rows(table, columns, frame):
    """Insert every row of *frame* into the SQL table *table*.

    *columns* is the comma-separated list of column names of *frame*.
    The statement is built once, values are passed as query parameters, and
    missing values (NaN/None) are sent as SQL NULL, like the comments loop does.
    Uses the notebook-level `cursor`.
    """
    placeholders = ",".join(["%s"] * len(frame.columns))
    sql = "INSERT INTO " + table + " (" + columns + ") VALUES (" + placeholders + ")"

    for _, row in frame.iterrows():
        values = tuple(value if pd.notna(value) else None for value in row)
        cursor.execute(sql, values)


_insert_rows("schools", schools_cols, schools)
_insert_rows("locations", locations_cols, locations)
_insert_rows("courses", courses_cols, courses)
_insert_rows("badges", badges_cols, badges)

In [25]:
# The statement is the same for every row, so it is built once before the loop
# instead of once per review.
comment_placeholders = ",".join(["%s"] * len(comments.columns))
sql = "INSERT INTO comments (" + comments_cols + ") VALUES (" + comment_placeholders + ")"

for _, row in comments.iterrows():
    # Missing values (NaN/None) are sent as SQL NULL
    values = tuple(value if pd.notna(value) else None for value in row)
    cursor.execute(sql, values)

In [26]:
#Saving and closing

# Commit the pending INSERTs, then release the cursor and the connection
connection.commit()
cursor.close()
connection.close()

# Shown as the cell result to confirm that the connection is closed
connection.is_connected()

False