In [42]:
# Setup the notebook

import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import chart_studio.plotly as py

# Line Charts 
## Visualize trends over time
---

In [24]:
# Path of the file to read
spotify_filepath = "data/spotify.csv"

# Read the file into a variable spotify_data
spotify_data = pd.read_csv(spotify_filepath, index_col="Date", parse_dates=True)

In [25]:
# Print the first 5 rows of the data
spotify_data.head()

Unnamed: 0_level_0,Shape of You,Despacito,Something Just Like This,HUMBLE.,Unforgettable
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2017-01-06,12287078,,,,
2017-01-07,13190270,,,,
2017-01-08,13099919,,,,
2017-01-09,14506351,,,,
2017-01-10,14275628,,,,


## Plot the data
Now that the dataset is loaded into the notebook, we need only one line of code to make a line chart!

In [26]:
# Line chart showing daily global streams of each song 
fig = go.Figure()

# Add traces

fig.add_trace(go.Line(
    x = spotify_data.index, 
    y = spotify_data["Shape of You"], 
    name = "Shape of You"))

fig.add_trace(go.Line(
    x = spotify_data.index, 
    y = spotify_data["Despacito"], 
    name = "Despacito"))

fig.add_trace(go.Line(
    x = spotify_data.index, 
    y = spotify_data["Something Just Like This"], 
    name = "Something Just Like This"))

fig.add_trace(go.Line(
    x = spotify_data.index, 
    y = spotify_data["HUMBLE."], 
    name = "HUMBLE."))

fig.add_trace(go.Line(
    x = spotify_data.index, 
    y = spotify_data["Unforgettable"], 
    name = "Unforgettable"))

fig.update_layout(paper_bgcolor="whitesmoke")
#plotly.graph_objs.layout.shape.Line(width=800, height=400)

fig.show()


plotly.graph_objs.Line is deprecated.
Please replace it with one of the following more specific types
  - plotly.graph_objs.scatter.Line
  - plotly.graph_objs.layout.shape.Line
  - etc.




## As we can see there's a lot of code, and the plot doesn't have any title or axe label. I we have our data in a different format we can handle the plot parameters in a easy way

In [27]:
# Path of the file to read
spotify_data_filepath = "data/spotify_edit.csv"

# Read the file into a variable spotify_data
spotify_data_edit = pd.read_csv(spotify_data_filepath, parse_dates=True)

In [28]:
spotify_data_edit.head(5)

Unnamed: 0,Song,Date,Times Streamed
0,Shape of You,06/01/17,12287078
1,Shape of You,07/01/17,13190270
2,Shape of You,08/01/17,13099919
3,Shape of You,09/01/17,14506351
4,Shape of You,10/01/17,14275628


In [29]:
fig2 = px.line(spotify_data_edit,
                  x="Date",
                  y="Times Streamed",
                  color = 'Song',
                  title = "Spotify most streamed songs [2017]"
                 )

fig2.show()

# Bar Charts & Heatmaps
## Use color or length to compare categories
---

## Load the data

In [30]:
# Path of the file to read
flight_filepath = "data/flight_delays.csv"

# Read the file into a variable flight_data
flight_data = pd.read_csv(flight_filepath, index_col="Month")

# Print the data
flight_data

Unnamed: 0_level_0,AA,AS,B6,DL,EV,F9,HA,MQ,NK,OO,UA,US,VX,WN
Month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,6.955843,-0.320888,7.347281,-2.043847,8.537497,18.357238,3.51264,18.164974,11.398054,10.889894,6.352729,3.107457,1.420702,3.389466
2,7.530204,-0.782923,18.657673,5.614745,10.417236,27.424179,6.029967,21.301627,16.474466,9.588895,7.260662,7.114455,7.78441,3.501363
3,6.693587,-0.544731,10.741317,2.077965,6.730101,20.074855,3.468383,11.018418,10.039118,3.181693,4.892212,3.330787,5.348207,3.263341
4,4.931778,-3.009003,2.780105,0.083343,4.821253,12.64044,0.011022,5.131228,8.766224,3.223796,4.376092,2.66029,0.995507,2.996399
5,5.173878,-1.716398,-0.709019,0.149333,7.72429,13.007554,0.826426,5.46679,22.397347,4.141162,6.827695,0.681605,7.102021,5.680777
6,8.191017,-0.220621,5.047155,4.419594,13.952793,19.712951,0.882786,9.639323,35.561501,8.338477,16.932663,5.766296,5.779415,10.743462
7,3.87044,0.377408,5.841454,1.204862,6.926421,14.464543,2.001586,3.980289,14.352382,6.790333,10.262551,,7.135773,10.504942
8,3.193907,2.503899,9.28095,0.653114,5.154422,9.175737,7.448029,1.896565,20.519018,5.606689,5.014041,,5.106221,5.532108
9,-1.432732,-1.8138,3.539154,-3.703377,0.851062,0.97846,3.696915,-2.167268,8.000101,1.530896,-1.794265,,0.070998,-1.33626
10,-0.58093,-2.993617,3.676787,-5.011516,2.30376,0.082127,0.467074,-3.735054,6.810736,1.750897,-2.456542,,2.254278,-0.688851


## Bar Chart
---

In [31]:
fig3 = px.bar(
    flight_data, 
    x=flight_data.index, 
    y=flight_data['NK'],
    title = "Arrival delay (in minutes)")

fig3.show()

In [32]:
fig4 = px.bar(
    flight_data, 
    x=flight_data.index, 
    y=flight_data['NK'],
    title = "Arrival delay (in minutes)",
    color = flight_data['NK'])

fig4.show()

## Heatmap

---

In [33]:
fig5 = px.imshow(
    flight_data,
    labels = dict(x = "Airline", y = "Months", color = "Arrival delay (minutes)"),
    title = "Average Arrival Delay for Each Airline, by Month",
    color_continuous_scale = "Viridis")

#Other possible values to color_continuous_scale: "RdBu_r", "Inferno", ["blue","red"], "Viridis" 
#  -> More detalis in https://plotly.com/python/colorscales/

fig5.show()

# Scatter Plot
Explore relationships between variables

---

## Load the data
We'll work with a (synthetic) dataset of insurance charges, to see if we can understand why some customers pay more than others.

In [34]:
# Path of the file to read
insurance_filepath = "data/insurance.csv"

# Read the file into a variable insurance_data
insurance_data = pd.read_csv(insurance_filepath)
 
# Print the data
insurance_data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [35]:
fig6 = px.scatter(insurance_data, x=insurance_data["bmi"], y=insurance_data["charges"])
fig6.show()

The scatterplot above suggests that [body mass index](https://en.wikipedia.org/wiki/Body_mass_index) (BMI) and insurance charges are **positively correlated**, where customers with higher BMI typically also tend to pay more in insurance costs. (This pattern makes sense, since high BMI is typically associated with higher risk of chronic disease.)

To double-check the strength of this relationship, you might like to add a **regression line**, or the line that best fits the data.

In [36]:
fig7 = px.scatter(
    insurance_data, 
    x=insurance_data["bmi"], 
    y=insurance_data["charges"],
    trendline="ols")
fig7.show()

### Color-coded scatter plots
We can use scatter plots to display the relationships between (not two, but...) three variables! One way of doing this is by color-coding the points.

For instance, to understand how smoking affects the relationship between BMI and insurance costs, we can color-code the points by `"smoker"`, and plot the other two columns (`"bmi"`, `"charges"`) on the axes.

In [37]:
fig8 = px.scatter(
    insurance_data, 
    x=insurance_data["bmi"], 
    y=insurance_data["charges"],
    trendline="ols",
    color = "smoker")
fig8.show()

#trendline: str
#        One of `'ols'` or `'lowess'`. If `'ols'`, an Ordinary Least Squares
#        regression line will be drawn for each discrete-color/symbol group. If
#        `'lowess`', a Locally Weighted Scatterplot Smoothing line will be drawn
#        for each discrete-color/symbol group.

This scatter plot shows that while nonsmokers to tend to pay slightly more with increasing BMI, smokers pay **MUCH** more.

---

# Histograms
Explore distributions

---

## Load and examine the data
We'll work with a dataset of 150 different flowers, or 50 each from three different species of iris (Iris setosa, Iris versicolor, and Iris virginica).


Each row in the dataset corresponds to a different flower. There are four measurements: the sepal length and width, along with the petal length and width. We also keep track of the corresponding species.

In [38]:
# Path of the file to read
iris_filepath = "data/iris.csv"

# Read the file into a variable iris_data
iris_data = pd.read_csv(iris_filepath, index_col="Id")

# Print the first 5 rows of the data
iris_data.head()

Unnamed: 0_level_0,Sepal Length (cm),Sepal Width (cm),Petal Length (cm),Petal Width (cm),Species
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,5.1,3.5,1.4,0.2,Iris-setosa
2,4.9,3.0,1.4,0.2,Iris-setosa
3,4.7,3.2,1.3,0.2,Iris-setosa
4,4.6,3.1,1.5,0.2,Iris-setosa
5,5.0,3.6,1.4,0.2,Iris-setosa


## Histograms

Say we would like to create a histogram to see how petal length varies in iris flowers.

In [39]:
fig9 = px.histogram(iris_data, x="Petal Length (cm)")
fig9.show()

In [40]:
fig10 = px.histogram(iris_data, x="Petal Length (cm)", color = "Species", opacity = 0.7)
fig10.show()

# Excersice
Scenario
You work for a major candy producer, and your goal is to write a report that your company can use to guide the design of its next product. Soon after starting your research, you stumble across this very interesting dataset (https://fivethirtyeight.com/videos/the-ultimate-halloween-candy-power-ranking/) containing results from a fun survey to crowdsource favorite candies.

## Load the Data
Read the candy data file into candy_data. Use the "id" column to label the rows

In [41]:
# Path of the file to read
candy_filepath = # Write your code here

# Fill in the line below to read the file into a variable candy_data
candy_data = # Write your code here

SyntaxError: invalid syntax (4112596357.py, line 2)

## Review the data
Use a Python command to print the first five rows of the data.

In [None]:
# Print the first five rows of the data

# Write your code here

**The dataset contains 83 rows, where each corresponds to a different candy bar. There are 13 columns:**

**"competitorname"** contains the name of the candy bar.
the next 9 columns (from "chocolate" to "pluribus") describe the candy. For instance, rows with chocolate candies have "Yes" in the **"chocolate"** column (and candies without chocolate have "No" in the same column).
**"sugarpercent"** provides some indication of the amount of sugar, where higher values signify higher sugar content.
**"pricepercent"** shows the price per unit, relative to the other candies in the dataset.
**"winpercent"** is calculated from the survey results; higher values indicate that the candy was more popular with survey respondents.
Use the first five rows of the data to answer the questions below.

### The role of sugar
Do people tend to prefer candies with higher sugar content?

**Part A**

Create a scatter plot that shows the relationship between "sugarpercent" (on the horizontal x-axis) and "winpercent" (on the vertical y-axis). Don't add a regression line just yet, you'll do that in the next step!

In [None]:
# Write your code here

### Part B
Does the scatter plot show a **strong** correlation between the two variables? If so, are candies with more sugar relatively more or less popular with the survey respondents?

In [None]:
# Write your code here

## Take a closer look
### Part A
Create the same scatter plot you created in Step 3, but now with a regression line!

In [None]:
# Write your code here

### Part B
According to the plot above, is there a slight correlation between 'winpercent' and 'sugarpercent'? What does this tell you about the candy that people tend to prefer?

In [None]:
# Write your code here

## Chocolate!
In the code cell below, create a scatter plot to show the relationship between **"pricepercent"** (on the horizontal x-axis) and **"winpercent"** (on the vertical y-axis). Use the **"chocolate"** column to color-code the points. Don't add any regression lines just yet, you'll do that in the next step!

In [None]:
# Write your code here

### Investigate chocolate
### Part A

Create the same scatter plot you created in last step, but now with two regression lines, corresponding to (1) chocolate candies and (2) candies without chocolate.

In [None]:
# Write your code here

### Part B

Using the regression lines, what conclusions can you draw about the effects of chocolate and price on candy popularity?

\# Write your solution here

In [None]:
# Write your code here

## Uploading A Visualization to Plotly

In [None]:
username = "" # your username
api_key = "" # your api key - go to profile > settings > regenerate key
chart_studio.tools.set_credentials_file(username=username, api_key=api_key)

### Push your visualiztion to your account using the following lines of code

In [None]:
py.plot(fig2, filename = 'Title', auto_open=True)