# Frequenties

Eerst alle imports voor deze les:

In [None]:
import pandas as pd
import math
import statistics as stat
import matplotlib.pyplot as plt

## Data inlezen

We lezen de data in:

In [None]:
# Load the laptop data set: fields are separated by ';', numbers use a decimal
# comma, and the first row (row 0) holds the column names.
laptops = pd.read_csv(
    "laptops.csv",
    sep=";",
    decimal=",",
    header=0,
)
# Print an overview of the columns, their dtypes and non-null counts.
laptops.info()

In [None]:
# Show the first ten rows as a quick sanity check of the parsed data.
laptops.head(10)

We zetten de datatypes juist:

In [None]:
# cpuGeneration becomes an ordered categorical; the order of the levels is the
# order in which they are listed here.
cpu_generation_levels = [
    'Sandy Bridge',
    'Ivy Bridge',
    'Haswell',
    'Broadwell',
    'Skylake',
    'Kabylake',
]
laptops["cpuGeneration"] = pd.Categorical(
    laptops["cpuGeneration"],
    categories=cpu_generation_levels,
    ordered=True,
)

# cpuType is ordered as well: i3 < i5 < i7.
cpu_levels = ['i3', 'i5', 'i7']
laptops["cpuType"] = pd.Categorical(
    laptops["cpuType"],
    categories=cpu_levels,
    ordered=True,
)

# brand is a plain (unordered) categorical; its levels are taken from the data.
laptops["brand"] = pd.Categorical(laptops["brand"])

# Print the column overview again to confirm the new dtypes.
laptops.info()

## Absolute frequenties

We berekenen de absolute frequenties voor cpuType:

In [None]:
# Absolute frequencies of cpuType. By default value_counts() leaves out
# missing values and orders the result by descending count.
freqs = laptops["cpuType"].value_counts()
print(freqs)

In [None]:
# Same count, but dropna=False also reports how many values are missing.
freqs = laptops["cpuType"].value_counts(dropna=False)
print(freqs)

In [None]:
# Order the counts by the index (the cpuType levels) instead of by count.
freqs = (
    laptops["cpuType"]
    .value_counts()
    .sort_index()
)
print(freqs)

In [None]:
# Counts including missing values, ordered by the index (the cpuType levels).
freqs = (
    laptops["cpuType"]
    .value_counts(dropna=False)
    .sort_index()
)
print(freqs)

## Klassen

In [None]:
# Count every distinct diskspace value separately (no grouping into classes yet).
freqs = laptops["diskspace"].value_counts()
print(freqs)

In [None]:
# Class boundaries: 0, 100, ..., 1100 (the stop value 1200 itself is excluded).
cutpoints = range(0, 1200, 100)
# A range prints as "range(0, 1200, 100)", so unpack it to see the actual values.
print([*cutpoints])

In [None]:
# Assign every diskspace value to a class. With the pd.cut defaults the
# intervals are open on the left and closed on the right: (a, b].
klassen = pd.cut(laptops["diskspace"], bins=cutpoints)
print(klassen)

In [None]:
# Put the original value and its class side by side to inspect the mapping.
print(
    pd.DataFrame(
        {
            'waarde': laptops["diskspace"],
            'klasse': klassen,
        }
    )
)

Nu kunnen we de frequenties berekenen:

In [None]:
# Absolute frequency per class, listed in class order rather than by count.
freqs = (
    klassen
    .value_counts()
    .sort_index()
)
print(freqs)

In [None]:
# right=False flips the interval convention to closed-left / open-right: [a, b).
klassen = pd.cut(
    laptops["diskspace"],
    bins=cutpoints,
    right=False,
)
print(klassen.value_counts().sort_index())

In [None]:
# include_lowest=True makes the first interval also contain its left edge,
# so a value equal to the lowest cutpoint is not dropped.
klassen = pd.cut(
    laptops["diskspace"],
    bins=cutpoints,
    include_lowest=True,
)
print(klassen.value_counts().sort_index())

Aantal klassen bepalen:

Sturges (aantal klassen = ⌈1 + log₂(n)⌉, met n het aantal waarnemingen):

In [None]:
# Work on the observed values only; missing values are dropped first.
diskspace = laptops["diskspace"].dropna()
# Number of observations that remain.
n = len(diskspace)
# Sturges' rule: 1 + log2(n), rounded up to a whole number of classes.
sturges = math.ceil(1 + math.log2(n))
print(sturges)

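Scott (klassenbreedte = 3,5 · s / ∛n; het aantal klassen is het bereik gedeeld door die breedte, naar boven afgerond):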

In [None]:
# Scott's rule gives a class width: 3.5 times the sample standard deviation,
# divided by the cube root of the number of observations.
breedte = 3.5 * stat.stdev(diskspace) / (n ** (1/3))
# Number of classes = range of the data divided by that width, rounded up.
scott = math.ceil(
    (diskspace.max() - diskspace.min()) / breedte
)
print(scott)

Excel (vierkantswortelregel: aantal klassen = ⌈√n⌉):

In [None]:
# Square-root rule: the number of classes is sqrt(n), rounded up.
wortel_n = math.sqrt(n)
excel = math.ceil(wortel_n)
print(excel)

In [None]:
# Let pandas split the data range into 11 equal-width classes and count per class.
# NOTE(review): 11 is hard-coded; presumably it was read off one of the rules
# computed above -- confirm against their printed results.
per_klasse = diskspace.value_counts(bins=11)
print(per_klasse.sort_index())

## Relatieve frequenties

In [None]:
# Relative frequencies: normalize=True returns fractions that sum to 1.
freqs = laptops["brand"].value_counts(normalize=True)
print(freqs)

In [None]:
# The same relative frequencies, expressed as percentages with one decimal.
freqs = (
    laptops["brand"]
    .value_counts(normalize=True)
    .mul(100)
    .round(1)
)
print(freqs)

## Cumulatieve frequenties

In [None]:
# Small demonstration of cumsum(): every element becomes the running total
# of all elements up to and including itself.
x = pd.Series([1, 2, 3, 4, 5])
print(x.tolist())
running_total = x.cumsum()
print(running_total.tolist())

In [None]:
# Cumulative absolute frequencies: count per cpuGeneration level, put in
# level order, then accumulated from the first level onwards.
freqs = (
    laptops["cpuGeneration"]
    .value_counts()
    .sort_index()
    .cumsum()
)
print(freqs)

In [None]:
# The same computation for brand, which was made an unordered categorical.
freqs = (
    laptops["brand"]
    .value_counts()
    .sort_index()
    .cumsum()
)
print(freqs)  # is this meaningful? Why or why not?

## Cumulatieve percentages

Ook "percentielscores" genoemd

In [None]:
# Cumulative percentages: relative frequencies in level order, accumulated,
# scaled to percent and rounded to one decimal.
freqs = (
    laptops["cpuGeneration"]
    .value_counts(normalize=True)
    .sort_index()
    .cumsum()
    .mul(100)
    .round(1)
)
print(freqs)

## Grafieken

Eenvoudig voorbeeld:

In [None]:
# Minimal line plot: one figure, one set of axes, one line.
fig, ax = plt.subplots()
ax.plot(
    [1, 2, 3, 4],  # x values
    [0, 4, 6, 7],  # y values
)
ax.set_xlabel("tijd (seconden)")
ax.set_ylabel("temperatuur (Kelvin)")
ax.set_title("Temperatuurverloop")
# Outside a notebook, display the figure with: fig.show()
# To write it to disk instead: fig.savefig('beeldje.png')

In [None]:
# Slightly more elaborate: two labelled lines, a legend and a dashed grid.
fig, ax = plt.subplots()
# First line: the format string 'o-' means circle markers joined by a solid line.
ax.plot([1, 2, 3, 4], [0, 4, 6, 7], 'o-', label='kamer1')
# Second line: the same styling options spelled out as keyword arguments.
ax.plot(
    [1, 2, 3, 4],
    [6, 5, 2, 1],
    label='kamer2',
    color='red',
    linestyle='dashed',
    marker='d',
)
ax.legend()
ax.grid(linestyle='--')
ax.set_xlabel('tijd (seconden)')
ax.set_ylabel('temperatuur (Kelvin)')
ax.set_title('Temperatuurverloop')
# Outside a notebook, display the figure with: fig.show()
# To write it to disk instead: fig.savefig('beeldje.png')

Taartdiagram:

In [None]:
# Absolute frequency of every RAM size, ordered by size.
x = laptops.RAM.value_counts().sort_index()
# Build the labels from the RAM sizes that actually occur, so they always match
# the wedges in number and order. A hard-coded list fails (or silently
# mislabels) as soon as the data holds a different set of sizes.
# Assumes RAM is numeric (a size in GB); ':g' drops a trailing '.0'.
l = [f"{ram:g}GB" for ram in x.index]
fig, ax = plt.subplots()
ax.pie(x, labels=l)
ax.set_title("RAM in laptops")
# Outside a notebook, display the figure with: fig.show()

Staafdiagram:

In [None]:
# Absolute frequency of every RAM size, ordered by size.
x = laptops.RAM.value_counts().sort_index()
# Build the labels from the RAM sizes that actually occur, so there is exactly
# one label per bar, in the same order. A hard-coded list fails (or silently
# mislabels) as soon as the data holds a different set of sizes.
# Assumes RAM is numeric (a size in GB); ':g' drops a trailing '.0'.
l = [f"{ram:g}GB" for ram in x.index]
fig, ax = plt.subplots()
# String labels put the bars on a categorical axis, equally spaced.
ax.bar(l, x)
# Horizontal grid lines only, to make the bar heights easier to read.
ax.grid(linestyle='--', axis='y')
ax.set_title("RAM in laptops")
ax.set_xlabel("RAM geheugen (GB)")
ax.set_ylabel("Aantal laptops")
# Outside a notebook, display the figure with: fig.show()

Histogram:

In [None]:
# Histogram of diskspace using matplotlib's default number of bins.
fig, ax = plt.subplots()
ax.hist(laptops["diskspace"])
# Horizontal grid lines only.
ax.grid(axis='y', linestyle='--')
ax.set_title("Schijfruimte in laptops")
ax.set_xlabel("Vrije schijfruimte (GB)")
ax.set_ylabel("Aantal laptops")
# Outside a notebook, display the figure with: fig.show()

In [None]:
# The same histogram with only three equal-width bins.
fig, ax = plt.subplots()
ax.hist(laptops["diskspace"], bins=3)
# Horizontal grid lines only.
ax.grid(axis='y', linestyle='--')
ax.set_title("Schijfruimte in laptops")
ax.set_xlabel("Vrije schijfruimte (GB)")
ax.set_ylabel("Aantal laptops")
# Outside a notebook, display the figure with: fig.show()

In [None]:
# Histogram with explicit, unequal bin edges; the bars get unequal widths.
cutpoints = [0, 120, 250, 500, 1000]
fig, ax = plt.subplots()
ax.hist(laptops["diskspace"], bins=cutpoints)
# Horizontal grid lines only.
ax.grid(axis='y', linestyle='--')
ax.set_title("Schijfruimte in laptops")
ax.set_xlabel("Vrije schijfruimte (GB)")
ax.set_ylabel("Aantal laptops")
# Outside a notebook, display the figure with: fig.show()

In [None]:
# Bar chart of the same classes: the counts are computed with pandas and drawn
# as equally wide bars, each labelled with the upper edge of its class.
cutpoints = [0, 120, 250, 500, 1000]
# One label per class: every cutpoint except the first, as text.
l = [str(bovengrens) for bovengrens in cutpoints[1:]]
x = (
    laptops["diskspace"]
    .value_counts(bins=cutpoints)
    .sort_index()
)
fig, ax = plt.subplots()
ax.bar(l, x)
# Horizontal grid lines only.
ax.grid(axis='y', linestyle='--')
ax.set_title("Harde schijven in laptops")
ax.set_xlabel("Schijfcapaciteit (GB)")
ax.set_ylabel("Aantal laptops")
# Outside a notebook, display the figure with: fig.show()

## Spider plots

In [None]:
# Spider (radar) chart of the number of laptops per brand.
x = laptops["brand"]
freqs = x.value_counts()
categories = freqs.index
values = freqs.values.tolist()
# Repeat the first value at the end so the outline closes on itself.
values.extend(values[:1])
aantal = len(freqs)
maximum = max(values)
# One evenly spaced angle (in radians) per category; the first angle is
# repeated too, to match the extra closing value.
angles = [k / float(aantal) * 2 * math.pi for k in range(aantal)]
angles.extend(angles[:1])

plt.figure()
ax = plt.subplot(1, 1, 1, polar=True)
# Category names around the circle (the duplicated closing angle is skipped).
plt.xticks(angles[:-1], categories, color='grey', size=8)
ax.set_rlabel_position(0)
# Radial ticks at 0, 1/4, 1/2 and 3/4 of the largest count, each labelled
# with its own value.
ticks = [k/4*maximum for k in range(4)]
plt.yticks(ticks, ticks, color="grey", size=7)
plt.ylim(0, maximum)
# Draw the outline first, then fill the enclosed area with transparent blue.
plt.plot(angles, values, linewidth=1, linestyle='solid')
plt.fill(angles, values, 'b', alpha=0.1)
# Outside a notebook, display the figure with: plt.show()

## Wordclouds

In [None]:
import wordcloud as wc

# Input text whose word frequencies determine the cloud.
tekst="imagine there's no heaven it's easy if you try no hell below us above us only sky imagine all the people living for today  imagine there's no countries it isn't hard to do nothing to kill or die for and no religion, too imagine all the people living life in peace you you may say I'm a dreamer but I'm not the only one I hope someday you'll join us and the world will be as one imagine no possessions I wonder if you can no need for greed or hunger a brotherhood of man imagine all the people sharing all the world you you may say I'm a dreamer but I'm not the only one I hope someday you'll join us and the world will live as one"
# At most 20 words, font size capped at 70, white background; the fixed
# random_state makes the layout reproducible.
cloud = wc.WordCloud(
    max_font_size=70,
    max_words=20,
    background_color="white",
    random_state=13,
).generate(tekst)
plt.figure()
# Render the cloud as an image and hide the axes around it.
plt.imshow(cloud, interpolation='bilinear')
plt.axis("off")
# Outside a notebook, display the figure with: plt.show()