In [1]:
import pandas as pd
import numpy as np
import overpy
import random

In [2]:
random.seed(99162)

In [3]:
zip_codes = pd.read_csv("plz_coord.csv")

In [4]:
columns=["count_bees","count_hive","breed","club","zip_code","longtitude","latitude","distance_to_body_of_water","distance_to_field_pollen","incidents(3 months)"]

api = overpy.Overpass()
r = api.query("""
area["ISO3166-1"="AT"][admin_level=2];
(node["place"="village"](area);
 way["place"="village"](area);
 rel["place"="village"](area);
);
out center;
""")
coords  = []
coords += [(float(node.lon), float(node.lat)) 
           for node in r.nodes]
coords += [(float(way.center_lon), float(way.center_lat)) 
           for way in r.ways]
coords += [(float(rel.center_lon), float(rel.center_lat)) 
           for rel in r.relations]

In [5]:
def truncated_normal(mean, stddev, minval, maxval):
        return np.clip(np.random.normal(mean, stddev), minval, maxval)

In [6]:
bee_synth_list = []
for i in range(30000):
        count_hive = round(truncated_normal(10,3,1,100),0)
        count_bees = count_hive * random.randrange(5000,70000)
        breed = random.choices(["Apis mellifera mellifera","Apis mellifera ligustica","Apis mellifera carnica"],weights= [0.05,0.25,0.7])[0]
        club = random.choice(["Bienenzuchtverein Ö", "Bienierer","Beedom","Bienieten","Beeraten"])
        random_zip = zip_codes.sample()
        zip_code = int(random_zip["Zipcode"])
        longitude = float(random_zip["Longitude"])
        latitude = float(random_zip["Latitude"])
        incidents = round(truncated_normal(0,4,0,8),0)
        #coord = random.choice(coords)

        distance_to_body_of_water = round(truncated_normal(200,40,0,1000),0)
        distance_to_field_pollen = round(truncated_normal(200,40,0,1000),0)

        bee_list = [
                count_bees,
                count_hive,
                breed,
                club,
                zip_code,
                longitude,
                latitude,
                distance_to_body_of_water,
                distance_to_field_pollen,
                incidents
        ]
        bee_synth_list.append(bee_list)

anzahl vorfälle, umkreis neighbours, reinheit

In [7]:
df = pd.DataFrame(columns=columns,data=bee_synth_list)
df

Unnamed: 0,count_bees,count_hive,breed,club,zip_code,longtitude,latitude,distance_to_body_of_water,distance_to_field_pollen,incidents(3 months)
0,383364.0,12.0,Apis mellifera carnica,Bienieten,6181,11.1961,47.1674,271.0,201.0,8.0
1,463295.0,7.0,Apis mellifera carnica,Beeraten,2442,16.4464,47.9727,170.0,225.0,1.0
2,471424.0,8.0,Apis mellifera carnica,Beedom,6773,9.8087,47.0749,197.0,197.0,1.0
3,311628.0,6.0,Apis mellifera carnica,Beeraten,4101,14.0705,48.3562,120.0,281.0,1.0
4,584730.0,10.0,Apis mellifera carnica,Bienenzuchtverein Ö,8934,14.7001,47.7278,195.0,204.0,0.0
...,...,...,...,...,...,...,...,...,...,...
29995,472846.0,11.0,Apis mellifera ligustica,Bienenzuchtverein Ö,5524,13.4486,47.5248,155.0,239.0,5.0
29996,124911.0,9.0,Apis mellifera carnica,Bienenzuchtverein Ö,8542,15.2609,46.7654,176.0,208.0,2.0
29997,85437.0,9.0,Apis mellifera ligustica,Bienenzuchtverein Ö,9634,13.1263,46.6395,215.0,259.0,0.0
29998,68904.0,9.0,Apis mellifera carnica,Beedom,8614,15.4616,47.3902,184.0,166.0,0.0


In [8]:
amount_neighbours = df["zip_code"].value_counts().reset_index().rename(columns={"zip_code" : "amount_neighbours","index":"zip_code"})
df = df.merge(amount_neighbours,on="zip_code")

In [9]:
df_purity = df[["zip_code","breed"]]
df_purity = df_purity.value_counts().reset_index().rename(columns={0:"count"})
total = df_purity.groupby(by = "zip_code").sum().reset_index().rename(columns={"count":"total"})

df_purity = df_purity.merge(total,on="zip_code")
df_purity["purity"] = df_purity["count"] / df_purity["total"]

df_purity = df_purity.sort_values("purity",ascending = False).drop_duplicates("zip_code").drop(["breed","count","total"],axis = 1)

In [10]:
df = df.merge(df_purity,on="zip_code")
df

Unnamed: 0,count_bees,count_hive,breed,club,zip_code,longtitude,latitude,distance_to_body_of_water,distance_to_field_pollen,incidents(3 months),amount_neighbours,purity
0,383364.0,12.0,Apis mellifera carnica,Bienieten,6181,11.1961,47.1674,271.0,201.0,8.0,15,0.866667
1,309426.0,13.0,Apis mellifera carnica,Bienierer,6181,11.1961,47.1674,189.0,268.0,0.0,15,0.866667
2,324470.0,10.0,Apis mellifera carnica,Beedom,6181,11.1961,47.1674,206.0,198.0,3.0,15,0.866667
3,154179.0,9.0,Apis mellifera ligustica,Beeraten,6181,11.1961,47.1674,186.0,214.0,2.0,15,0.866667
4,737568.0,16.0,Apis mellifera carnica,Bienieten,6181,11.1961,47.1674,158.0,288.0,3.0,15,0.866667
...,...,...,...,...,...,...,...,...,...,...,...,...
29995,680028.0,12.0,Apis mellifera carnica,Bienenzuchtverein Ö,4864,13.5247,47.9073,187.0,234.0,0.0,5,0.800000
29996,657080.0,10.0,Apis mellifera carnica,Bienieten,4864,13.5247,47.9073,234.0,230.0,0.0,5,0.800000
29997,142580.0,5.0,Apis mellifera carnica,Bienieten,4941,13.4313,48.2044,240.0,138.0,0.0,3,1.000000
29998,204512.0,14.0,Apis mellifera carnica,Bienierer,4941,13.4313,48.2044,143.0,207.0,3.0,3,1.000000


## Adding weather Data


get Data


In [11]:
import requests
import datetime as dt

In [12]:
city_weath = requests.get("https://api.openweathermap.org/data/2.5/weather?lat=48.21&lon=16.36&appid=667bf9f551aad1a305cdb30f4c89499d").json()
city_name="Vienna"

In [13]:
city_weath

{'coord': {'lon': 16.36, 'lat': 48.21},
 'weather': [{'id': 801,
   'main': 'Clouds',
   'description': 'few clouds',
   'icon': '02d'}],
 'base': 'stations',
 'main': {'temp': 295.16,
  'feels_like': 295.04,
  'temp_min': 294.02,
  'temp_max': 297.25,
  'pressure': 1017,
  'humidity': 62},
 'visibility': 10000,
 'wind': {'speed': 10.28, 'deg': 319, 'gust': 21.46},
 'clouds': {'all': 20},
 'dt': 1659253997,
 'sys': {'type': 2,
  'id': 2037452,
  'country': 'AT',
  'sunrise': 1659238103,
  'sunset': 1659292402},
 'timezone': 7200,
 'id': 2775259,
 'name': 'Inner city',
 'cod': 200}

### Dataframe

In [14]:
sun_hours = city_weath['sys']["sunset"] - city_weath['sys']["sunrise"]
sun_hours = sun_hours/60/60

In [15]:
city_df = {
    "lat": city_weath["coord"]["lat"],
    "long": city_weath["coord"]["lon"],
    "city": city_name,
    'temp': city_weath["main"]["temp"], 
    'humidity': city_weath['main']["humidity"],
    'windspeed': city_weath["wind"]['speed'], 
    'sun_hours': round(sun_hours,0) }

In [16]:
city_df

{'lat': 48.21,
 'long': 16.36,
 'city': 'Vienna',
 'temp': 295.16,
 'humidity': 62,
 'windspeed': 10.28,
 'sun_hours': 15.0}

In [17]:
weather_columns=["city","lat","long", "temp","humidity","windspeed","sun_hours"]

weather_df = pd.DataFrame(columns=weather_columns)
weather_df

Unnamed: 0,city,lat,long,temp,humidity,windspeed,sun_hours


add other cities

In [18]:
def city_df_export(city:str, query: str):
    city_weath = requests.get(query).json()
    city_name=city
    
    sun_hours = city_weath['sys']["sunset"] - city_weath['sys']["sunrise"]
    sun_hours = sun_hours/60/60

    city_df = {
    "lat": city_weath["coord"]["lat"],
    "long": city_weath["coord"]["lon"],
    "city": city_name,
    'temp': city_weath["main"]["temp"], 
    'humidity': city_weath['main']["humidity"],
    'windspeed': city_weath["wind"]['speed'], 
    'sun_hours': round(sun_hours,0) }
    return city_df



vienna_city = city_df_export("Vienna","https://api.openweathermap.org/data/2.5/weather?lat=48.21&lon=16.36&units=metric&appid=667bf9f551aad1a305cdb30f4c89499d")

weather_df = weather_df.append(vienna_city,ignore_index= True)

stp_city = city_df_export("St. Poelten","https://api.openweathermap.org/data/2.5/weather?lat=48.2&lon=15.63&units=metric&appid=667bf9f551aad1a305cdb30f4c89499d")

weather_df = weather_df.append(stp_city,ignore_index= True)

linz_city = city_df_export("Linz","https://api.openweathermap.org/data/2.5/weather?lat=48.3&lon=14.28&units=metric&appid=667bf9f551aad1a305cdb30f4c89499d")

weather_df = weather_df.append(linz_city,ignore_index= True)

bregenz_city = city_df_export("Bregenz","https://api.openweathermap.org/data/2.5/weather?lat=47.5&lon=9.74&units=metric&appid=667bf9f551aad1a305cdb30f4c89499d")

weather_df = weather_df.append(bregenz_city,ignore_index= True)

In [19]:
weather_df

Unnamed: 0,city,lat,long,temp,humidity,windspeed,sun_hours
0,Vienna,48.21,16.36,22.01,62,10.28,15.0
1,St. Poelten,48.2,15.63,22.05,87,4.35,15.0
2,Linz,48.3,14.28,20.49,72,4.63,15.0
3,Bregenz,47.5,9.74,22.48,76,1.54,15.0


## add to current dataframe


In [20]:
weather_df.sample()

Unnamed: 0,city,lat,long,temp,humidity,windspeed,sun_hours
2,Linz,48.3,14.28,20.49,72,4.63,15.0


In [21]:
df["temp"] = 0
df["humidity"] = 0
df["windspeed"] = 0
df["sun_hours"] = 0


In [22]:
sample = weather_df.sample()
sample["humidity"].to_string().split("  ")[2]

'62'

In [23]:
for i in range(30000):
    sample = weather_df.sample()
    df["temp"][i] = sample["temp"]
    df["humidity"][i] = sample["humidity"].to_string().split("  ")[2]
    df["windspeed"][i] = sample["windspeed"]
    df["sun_hours"][i] = sample["sun_hours"]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["temp"][i] = sample["temp"]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["humidity"][i] = sample["humidity"].to_string().split("  ")[2]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["windspeed"][i] = sample["windspeed"]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["sun_hours"][i] = sample["s

In [33]:
df["good_place"] = 1
bad = df.loc[(df["distance_to_body_of_water"] > 300) | (df["distance_to_field_pollen"] > 300) | (df["incidents(3 months)"] > 3) | (df["purity"] < 0.5)].index

df.loc[bad,"good_place"] = 0

bad2 = df[df["good_place"]==1].sample(4000).index

df.loc[bad2,"good_place"] = 0

In [36]:
df.to_csv("synth_data.csv",sep=";")

In [35]:
df[df["good_place"]==0]

Unnamed: 0,count_bees,count_hive,breed,club,zip_code,longtitude,latitude,distance_to_body_of_water,distance_to_field_pollen,incidents(3 months),amount_neighbours,purity,temp,humidity,windspeed,sun_hours,good_place
0,383364.0,12.0,Apis mellifera carnica,Bienieten,6181,11.1961,47.1674,271.0,201.0,8.0,15,0.866667,20,72,4,15,0
8,163496.0,8.0,Apis mellifera carnica,Beeraten,6181,11.1961,47.1674,176.0,126.0,0.0,15,0.866667,22,62,10,15,0
12,207600.0,4.0,Apis mellifera carnica,Bienieten,6181,11.1961,47.1674,208.0,228.0,4.0,15,0.866667,22,62,10,15,0
14,91679.0,7.0,Apis mellifera ligustica,Bienenzuchtverein Ö,6181,11.1961,47.1674,235.0,263.0,8.0,15,0.866667,20,72,4,15,0
20,309684.0,6.0,Apis mellifera carnica,Beeraten,2442,16.4464,47.9727,222.0,218.0,0.0,13,0.692308,22,76,1,15,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29980,65298.0,6.0,Apis mellifera mellifera,Beedom,9821,13.2055,46.9274,206.0,198.0,5.0,7,0.571429,22,87,4,15,0
29987,414744.0,12.0,Apis mellifera carnica,Beedom,7000,16.5353,47.8383,196.0,197.0,0.0,6,0.833333,22,62,10,15,0
29988,131688.0,9.0,Apis mellifera carnica,Bienieten,7000,16.5353,47.8383,212.0,224.0,0.0,6,0.833333,22,76,1,15,0
29991,455952.0,12.0,Apis mellifera carnica,Bienenzuchtverein Ö,7000,16.5353,47.8383,191.0,174.0,4.0,6,0.833333,22,76,1,15,0
