In [1]:
def genDataset(n_features = 4, n_samples = 1000, low = 0, high = 10, weights = [0.25, 0.25, 0.25, 0.25], threshold = 5):
    """
    The function generates a random dataset with n predictor features and the result. The result
    can take two values, zero and one. The result depends on the threshold.
    
    :param n_features: Number of features.
    :param n_samples: Number of samples.
    :param low: The lowest value for the range of random values.
    :param high: The highest value for the range of random values.
    :param weight: List of the weights for each feature. There must be a weight for each feature.
    :param threshold: It is the value that marks the limit to assign zero or one to the result.
    
    :return: Dataframe 
    
    """
    import pandas as pd
    import numpy as np

    if len(weights) == n_features:
        randomData = np.random.randint(low = low, high = high , size=(n_samples, n_features))
        weights_m = np.asarray([weights]*n_samples)
        p = np.array([1 if sum(row) >= threshold else 0 for row in randomData * weights_m])
        res = pd.DataFrame(data=np.column_stack((randomData, p)),
                columns = ['feat_%s_%s' %(str(i), str(w)) for i, w in enumerate(weights)]+['result_thr_%s'%threshold])
 
        return res 
    else:
        print ("There must be a weight for each feature. Please, check the weights matrix.")

In [2]:
data = genDataset(n_features = 4, n_samples = 100000, low = 0, high = 10, weights = [0.125, 0.375, 0.375, 0.125], threshold = 6)

In [3]:
data.head()

Unnamed: 0,feat_0_0.125,feat_1_0.375,feat_2_0.375,feat_3_0.125,result_thr_6
0,0,6,4,1,0
1,8,7,7,4,1
2,4,5,6,3,0
3,5,2,5,7,0
4,6,6,4,5,0


In [5]:
import pandas as pd
data = [['Alex',10],['Bob',12],['Clarke',13]]
df = pd.DataFrame(data,columns=['Name','Age'],dtype=float)
print (df)

     Name   Age
0    Alex  10.0
1     Bob  12.0
2  Clarke  13.0


In [20]:
import random

def generate_data_Temp_High(number_of_samples):
    samples = []
    for i in range (number_of_samples):
        visibility = random.randint(50, 100)/100
        numMin = random.randint(540, 900)
        
        #generate month between 2 - 6 or between 10 - 12
        randMonth = random.randint(0, 1)
        if randMonth:
            numMonth = random.randint(2, 6)
        else:
            numMonth = random.randint(10, 12)
        
        #temperature
        temperature =  random.randint(20, 30)
        
        sample = [visibility, numMin, numMonth, temperature]
        samples.append(sample)
    return samples

def generate_data_Temp_Medium(number_of_samples):
    samples = []
    for i in range (number_of_samples):
        visibility = random.randint(40, 80)/100
        numMin = random.randint(840, 1080)
        
        #generate month between 2 - 6 or between 10 - 12
        randMonth = random.randint(0, 1)
        if randMonth:
            numMonth = random.randint(2, 6)
        else:
            numMonth = random.randint(10, 12)
        
        #temperature
        temperature =  random.randint(10, 20)
        
        sample = [visibility, numMin, numMonth, temperature]
        samples.append(sample)
    return samples

def generate_data_Temp_Low(number_of_samples):
    samples = []
    for i in range (number_of_samples):
        visibility = random.randint(0, 40)/100
        
        #generate min between 0 - 540 or between 1080 - 1440
        randMin = random.randint(0, 1)
        if randMin:
            numMin = random.randint(0, 540)
        else:
            numMin = random.randint(1080, 1440)
            
        #generate month between 1 or between 7 - 12
        randMonth = random.randint(0, 1)
        if randMonth:
            numMonth = 1
        else:
            numMonth = random.randint(7, 12)
        
        #temperature
        temperature =  random.randint(0, 10)
        
        sample = [visibility, numMin, numMonth, temperature]
        samples.append(sample)
    return samples
        
def generate_data(number_of_samples, percent_high, percent_medium, percen_low):
    num_samples_high = int(number_of_samples * percent_high)
    num_samples_medium = int(number_of_samples * percent_medium)
    num_samples_low = int(number_of_samples * percen_low)
    
    samples_high = generate_data_Temp_High(num_samples_high)
    samples_medium = generate_data_Temp_Medium(num_samples_medium)
    samples_low = generate_data_Temp_Low(num_samples_low)
    
    print("number of samples: %d" %(number_of_samples))  
    samples = []
    
data = generate_data(5, 0.5, 0.2, 0.3)
print(data)


number of samples: 5
None
