### Agenda
0. Basic Setup
1. Basic Plotting
2. Reading, Describing and Accessing Data
3. Filters, Drop, Add Columns
4. Replacing, Casting, Histogram 
5. Barplot, Dropna

# Part 0
Basic Setup

In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

from collections import Counter

In [None]:
%matplotlib inline
plt.style.use('seaborn-poster')

# Part 1
Basic Plotting

In [None]:
# Sample data for the first plot: seven accuracy values paired with the
# iteration counts 1000, 2000, ..., 7000.
iterations = [1000 * step for step in range(1, 8)]
accuracy = [
    0.15, 0.231, 0.44, 0.56,
    0.7111, 0.8, 0.905,
]

In [None]:
# Format string '*--r' = star markers, dashed line, red.
# Keyword equivalent: color='red', linestyle='--', marker='*'
plt.plot(iterations, accuracy, '*--r', linewidth=3, label='model_1')

plt.title('Basic Plot')
plt.xlabel('Iterations')
plt.ylabel('Accuracy')
plt.legend()       # picks up the label= given to plt.plot
plt.tight_layout()

plt.show()

# To write the figure to disk, call plt.savefig('basic_plot.png') *before*
# plt.show(); the filename must be a quoted string.

# Part 2
Reading, Describing and Accessing Data

In [None]:
# Load the survey CSV; the path is relative to the notebook's working directory.
main_df = pd.read_csv('data/so_survey_2021.csv')
# Preview the first five rows.
main_df.head()

### Table Schema
Stack Overflow annual survey 2021  

__Country__ : Where do you live?   
__US_State__ : In which state or territory of the USA do you live?  
__UK_Country__ : In which part of the United Kingdom do you live?  
__Age__ : 	What is your age?  
__YearsCode__ : Including any education, how many years have you been coding in total?  
__EdLevel__ : Which of the following best describes the highest level of formal education that you’ve completed?  
__DevType__ : Which of the following describes your current job? Please select all that apply.  
__LanguageHaveWorkedWith__ : Which programming, scripting, and markup languages have you done extensive development work in over the past year?  
__CompTotal__ : What is your current total compensation (salary, bonuses, and perks, before taxes and deductions), in $? If you are paid hourly, please estimate an equivalent weekly, monthly, or yearly salary.  

In [None]:
# Per-column dtype and non-null count, plus the frame's memory usage.
main_df.info()

In [None]:
# (number of rows, number of columns)
main_df.shape

In [None]:
# include='all' summarises the non-numeric columns too, not just the numeric ones.
main_df.describe(include='all')

In [None]:
# Returns a new frame indexed by ResponseId. The result is only displayed, not
# assigned, so main_df itself keeps its current index.
main_df.set_index('ResponseId')

In [None]:
# .loc selects by index *label*: the row whose label is 4.
main_df.loc[4]

In [None]:
# A list of labels returns a DataFrame with exactly those rows.
main_df.loc[[14, 19]]

In [None]:
# Label slices with .loc include BOTH endpoints (14 through 19).
main_df.loc[14:19]

In [None]:
# Same row slice, restricted to the 'Age' column; the result is a Series.
main_df.loc[14:19, 'Age']

In [None]:
# Column labels of the frame.
main_df.columns

In [None]:
# Build a two-level index (Country, EdLevel). inplace=True mutates main_df, and
# both fields stop being regular columns from here on.
main_df.set_index(['Country', 'EdLevel'], inplace=True)

In [None]:
# Partial key: every row whose first index level (Country) is 'Canada'.
main_df.loc['Canada']

In [None]:
# Full key on the two-level index: Country == 'Canada' and
# EdLevel == 'Something else' (an index value here, not a column label).
main_df.loc['Canada', 'Something else']

In [None]:
# Returns a new frame with the index levels moved back into columns. The result
# is only displayed, so main_df keeps its (Country, EdLevel) index.
main_df.reset_index()

In [None]:
# main_df is still indexed by (Country, EdLevel) here, because the
# reset_index() result above was not assigned. Move those levels back into
# columns first: setting ResponseId as the index directly would replace the
# current index and silently discard both fields.
main_df = main_df.reset_index().set_index('ResponseId')

# Part 3
Filters, Drop, Add Columns

In [None]:
# drop() returns a new frame without the two columns; main_df is unchanged
# unless the result is assigned back or inplace=True is passed.
main_df.drop(columns=['US_State', 'UK_Country'])   # inplace=True

In [None]:
# As written, the drop above was not applied to main_df, so this still lists
# US_State and UK_Country.
main_df.info()

In [None]:
# Drop rows by index label: main_df.index[0:100] is the first 100 labels.
# Like the column drop, this returns a new frame and leaves main_df untouched.
main_df.drop(main_df.index[0:100])   # inplace=True

In [None]:
# Case-insensitive regex match on the DevType text; na=False makes missing
# answers count as "no match", so the mask is plain True/False with no NaN.
filt = main_df['DevType'].str.contains('machine learning|data scientist', na=False, case=False)
# The mask is already boolean, so it can be stored directly; wrapping it in
# np.where(filt, True, False) would only rebuild the same values.
main_df['DataScienceOrML'] = filt

In [None]:
# Confirm the new DataScienceOrML column is present and check its dtype.
main_df.info()

In [None]:
# Number of rows per flag value (True / False).
main_df['DataScienceOrML'].value_counts()

# Part 4
Replacing, Casting, Histogram 

In [None]:
# Toy values for the histogram demos below.
x = [0, 1, 1.9999, 3, 3.5, 4]

In [None]:
# Explicit bin edges 1, 2, 3, 4 -> three bins; values outside the edges
# (the 0 in x) are not counted.
plt.hist(x, edgecolor='black', bins=list(range(1, 5)))
plt.tight_layout()
plt.show()

### How does it work?

```
plt.hist(x=some_data, bins=[1, 2, 3, 4])
```

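1st bin: [1, 2)  
2nd bin: [2, 3)  
3rd bin: [3, 4] (only the last bin includes its right edge)  

Values outside the outer edges (for example `0` in `x`) are not counted.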

In [None]:
# An integer `bins` asks for that many equal-width bins across the data's
# own min..max range, so every value in x is counted this time.
plt.hist(x, color='darkgreen', edgecolor='black', bins=3)
plt.tight_layout()
plt.show()

In [None]:
# Inspect the raw YearsCode values before converting the column to numbers.
main_df['YearsCode'].value_counts()

In [None]:
# Map the two text categories onto numbers, then cast the column to float.
# The result is assigned back to the column rather than using
# replace(..., inplace=True) on main_df['YearsCode']: that form is chained
# assignment, which under pandas Copy-on-Write never reaches main_df and
# would leave the text values in place for astype(float) to fail on.
main_df['YearsCode'] = (
    main_df['YearsCode']
    .replace({'Less than 1 year': 0, 'More than 50 years': 51})
    .astype(float)
)

In [None]:
# Summary statistics of the now-numeric YearsCode column.
main_df['YearsCode'].describe()

In [None]:
# Median years of coding (missing values are skipped by default).
main_df['YearsCode'].median()

In [None]:
# Distribution of YearsCode using 15 equal-width bins.
plt.hist(main_df['YearsCode'], bins=15, edgecolor='black', color='orange')

# Label the figure so it can be read on its own.
plt.xlabel('Years of Coding')
plt.ylabel('Number of Respondents')
plt.title('Coding Experience (15 bins)')
plt.tight_layout()

plt.show()

In [None]:
# Bin edges every 2 years: 0, 2, 4, ..., 52.
bins = [*range(0, 53, 2)]

In [None]:
# Same distribution, now with the explicit bin edges held in `bins`.
plt.hist(main_df['YearsCode'], bins=bins, edgecolor='black')

# Label the figure so it can be read on its own.
plt.xlabel('Years of Coding')
plt.ylabel('Number of Respondents')
plt.title('Coding Experience (custom bins)')
plt.tight_layout()

plt.show()

# Part 5
Barplot, Dropna

In [None]:
# Column labels as a plain Python list.
main_df.columns.tolist()

In [None]:
# ResponseId was made the index earlier, so the ids come from the index
# rather than from a column (main_df['ResponseId']).
ids = main_df.index
# Drop unanswered rows: a missing answer is NaN (a float), which has no
# .split() and would raise AttributeError in the counting loop.
langs = main_df['LanguageHaveWorkedWith'].dropna()
language_counter = Counter()

In [None]:
# Start from an empty tally so re-running this cell does not double the counts.
language_counter.clear()
# Each answer is a ';'-separated list of languages; tally every listed name.
for response in langs:
    language_counter.update(response.split(';'))

In [None]:
# most_common(10) yields (language, count) pairs; zip(*...) transposes them
# into one tuple of names and one tuple of counts, each turned into a list.
languages, popularity = (
    list(column) for column in zip(*language_counter.most_common(10))
)

In [None]:
# One bar per language, in the order returned by most_common (most used first).
plt.bar(languages, popularity, color='darkgreen')

plt.xticks(rotation=45)   # tilt the language names so they do not overlap
plt.ylabel('Number of People Who Use')
plt.title('Most Popular Languages')
# Re-fit the layout so the rotated tick labels are not clipped, matching the
# other plotting cells in this notebook.
plt.tight_layout()

plt.show()