In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# import visualization
import matplotlib.pyplot as plt

# import machine learning
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, mean_squared_error, explained_variance_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Read the forest-fires CSV from the notebook's input directory and preview the first rows.
FIRES_CSV = '../input/forest-fires-data-set/forestfires.csv'
fires = pd.read_csv(FIRES_CSV)
fires.head()

In [None]:
# Histogram of the target column; axis labels are set so the figure can be read on its own.
plt.hist(fires['area'], bins=20)
plt.title('Area Distribution')
plt.xlabel('area')
plt.ylabel('number of samples')
plt.show()

The distribution is long-tailed, with only a small number of samples at high area values.

In [None]:
# simple regression model
# Drop the X and Y columns and one-hot encode the remaining non-numeric columns.
df = pd.get_dummies(fires.drop(columns=['X', 'Y']))

X = df.drop(columns=['area'])
y = df[['area']]

# Split BEFORE scaling: fitting the scaler on all rows would leak the test rows'
# mean/variance into the training features and make the scores optimistic.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)  # statistics come from the training rows only
X_test = scaler.transform(X_test)        # test rows reuse the training statistics

model = LinearRegression()
model.fit(X_train, y_train.values.ravel())

y_pred = model.predict(X_test)

# print out the prediction scores
print('RMSE: {}'.format(np.sqrt(mean_squared_error(y_test, y_pred))))
print('MAE: {}'.format(mean_absolute_error(y_test, y_pred)))
# explained_variance_score is not R-squared (it ignores a systematic offset in the
# predictions), so the value is labelled with the metric that is actually computed.
print('Explained variance: {}'.format(explained_variance_score(y_test, y_pred)))

In [None]:
# Scatter the predictions against the true values; the orange identity line marks
# where perfectly predicted points would fall.
plt.scatter(y_pred, y_test)
plt.plot(y_test, y_test, c = 'orange')
plt.xlabel('prediction')
plt.ylabel('true values')
plt.xlim(0,1000)
plt.ylim(0,1000)
plt.title('Predicted vs True values')
# Render explicitly, like the other plotting cells, instead of echoing the Text repr.
plt.show()

We see that the model performs very badly. Ideally, the dots should lie close to the line.

In [None]:
# logistic (sigmoid) function
# https://en.wikipedia.org/wiki/Sigmoid_function
def sigmoid(x):
    '''
    Element-wise logistic function 1 / (1 + e^(-x)).
    INPUT:
        x - scalar or numpy array
    OUTPUT:
        value(s) in the interval [0, 1], same shape as x
    '''
    decay = np.exp(-x)
    return 1 / (1 + decay)

# draw the sigmoid curve on [-10, 10] with dotted reference lines through the origin
sigmoid_grid = np.linspace(-10, 10, 100)
plt.plot(sigmoid_grid, sigmoid(sigmoid_grid))
plt.axhline(y=0, c='black', linestyle=':')
plt.axvline(x=0, c='black', linestyle=':')
plt.title('Sigmoid Function')
plt.xlabel('x')
plt.ylabel('sigmoid(x)')
plt.show()

In [None]:
# horizontal box plot of the area column
area_values = fires['area']
plt.boxplot(area_values, vert=False)
plt.xlabel('area')
plt.title('Area Distribution')
plt.show()

In [None]:
# implement relevance function
# see paper: https://www.researchgate.net/publication/220699419_Utility-Based_Regression
def relevance(x, center=170):
    '''
    Relevance of target values: a sigmoid shifted so that relevance(center) == 0.5.
    INPUT:
        x - scalar or array-like of target values
        center - target value at which the relevance equals 0.5
                 (default 170, the value previously hard-coded here)
    OUTPUT:
        relevance score(s) in [0, 1], approaching 1 for values well above center
    '''
    # Coerce to float so that object-dtype inputs (e.g. a DataFrame column built
    # row by row) do not break np.exp inside sigmoid.
    x = np.asarray(x, dtype=float)
    return sigmoid(x - center)

# draw the relevance curve over target values in [0, 1000]
relevance_grid = np.linspace(0, 1000, 1000)
plt.plot(relevance_grid, relevance(relevance_grid))
plt.axhline(y=0, c='black', linestyle=':')
plt.axvline(x=0, c='black', linestyle=':')
plt.title('Relevance Function')
plt.xlabel('x')
plt.ylabel('relevance(x)')
plt.show()

In [None]:
# implement SMOTER
# see paper: https://core.ac.uk/download/pdf/29202178.pdf

def get_synth_cases(D, target, o=200, k=3, categorical_col = []):
    '''
    Function to generate the new (synthetic) cases by interpolating between
    every case of D and randomly chosen nearest neighbors of that case.
    INPUT:
        D - pd.DataFrame with the initial data; every column must hold numeric values
        target - string name of the target column in the dataset
        o - oversampling rate in percent; o // 100 new cases are generated per row of D
        k - number of nearest neighbors to use for the generation
            (reduced to len(D) - 1 when D has too few rows)
        categorical_col - list of categorical column names
    OUTPUT:
        new_cases - pd.DataFrame containing new generated cases, with the columns of D;
                    empty when o < 100 or D has fewer than two rows
    NOTE:
        Random numbers come from the global np.random state, so np.random.seed()
        makes the output reproducible.
    '''
    ng = o // 100 # the number of new cases to generate per original case
    k = min(k, len(D) - 1) # a case cannot be its own neighbor
    if ng < 1 or k < 1:
        return pd.DataFrame(columns = D.columns)

    categorical = set(categorical_col)
    feature_cols = [c for c in D.columns if c != target]
    features = D[feature_cols].to_numpy(dtype = float)
    targets = D[target].to_numpy(dtype = float)

    # The neighbor model does not depend on the case, so fit it once and query
    # all rows together (k+1 because a case is normally its own nearest neighbor).
    knn = KNeighborsRegressor(n_neighbors = k + 1)
    knn.fit(features, targets)
    all_neighbors = knn.kneighbors(features, return_distance = False)

    rows = [] # collected as dicts; the DataFrame is built once at the end
    for pos in range(len(D)):
        case = D.iloc[pos]
        # kneighbors returns POSITIONS, so the case is removed by position, not by
        # index label (labels differ from positions on filtered frames). With
        # duplicated rows the case may be missing from its own list; the slice
        # keeps exactly k candidates either way.
        candidates = all_neighbors[pos]
        candidates = candidates[candidates != pos][:k]
        for _ in range(ng):
            # randomly choose one of the neighbors
            nb = candidates[np.random.randint(len(candidates))]
            x = D.iloc[nb]
            attr = {}
            for a in feature_cols:
                if a in categorical:
                    # if categorical then choose randomly one of values
                    attr[a] = case[a] if np.random.randint(2) == 0 else x[a]
                else:
                    # if continuous column: random point on the segment between
                    # the case and its neighbor
                    attr[a] = case[a] + np.random.uniform(0, 1) * (x[a] - case[a])

            # decide the target column: average of the two targets, each weighted
            # by the distance to the OTHER point, so the closer one dominates
            new = np.array([attr[a] for a in feature_cols], dtype = float)
            d1 = np.linalg.norm(new - features[pos])
            d2 = np.linalg.norm(new - features[nb])
            if d1 + d2 > 0:
                attr[target] = (d2 * targets[pos] + d1 * targets[nb]) / (d1 + d2)
            else:
                # case and neighbor have identical features: plain average
                attr[target] = (targets[pos] + targets[nb]) / 2

            rows.append(attr)

    return pd.DataFrame(rows, columns = D.columns)

def SmoteR(D, target, th = 0.999, o = 200, u = 100, k = 3, categorical_col = []):
    '''
    The implementation of SmoteR algorithm:
    https://core.ac.uk/download/pdf/29202178.pdf
    Rare cases (relevance above th) are over-sampled with synthetic cases and
    the remaining ("normal") cases are randomly under-sampled.
    INPUT:
        D - pd.DataFrame - the initial dataset
        target - the name of the target column in the dataset
        th - relevance threshold; cases with relevance > th are treated as rare
        o - oversampling rate in percent (passed to get_synth_cases)
        u - undersampling rate in percent: the number of normal cases kept is
            u% of the number of synthetic cases, capped at the number available
        k - the number of nearest neighbors
        categorical_col - list of categorical column names
    OUTPUT:
        new_D - the resulting new dataset: original rare cases, synthetic cases
                and the sampled normal cases, with a fresh 0..n-1 index
    '''
    # median of the target variable
    y_bar = D[target].median()

    # relevance is evaluated once and reused for every mask
    is_rare = np.asarray(relevance(D[target])) > th
    is_low = (D[target] < y_bar).to_numpy()

    # find rare cases where target less than median
    rareL = D[is_rare & is_low]
    # generate synthetic cases for rareL
    new_casesL = get_synth_cases(rareL, target, o, k , categorical_col)

    # find rare cases where target greater than (or equal to) median;
    # ">=" keeps rare cases that sit exactly on the median from being dropped
    rareH = D[is_rare & ~is_low]
    # generate synthetic cases for rareH
    new_casesH = get_synth_cases(rareH, target, o, k , categorical_col)

    # the synthetic cases (previously generated but never used)
    new_cases = pd.concat([new_casesL, new_casesH], axis=0)

    # get the number of norm cases
    nr_norm = int(len(new_cases) * u / 100)

    # undersample norm cases
    normal = D[~is_rare]
    norm_cases = normal.sample(min(len(normal), nr_norm))

    # get the resulting dataset; empty parts are skipped so that they cannot
    # change column dtypes, and the index is rebuilt because synthetic rows would
    # otherwise reuse index labels of original rows
    parts = [part for part in (rareL, rareH, new_casesL, new_casesH, norm_cases) if len(part) > 0]
    if not parts:
        return D.iloc[0:0].copy()
    new_D = pd.concat(parts, axis=0, ignore_index=True)

    return new_D

In [None]:
# Toy dataset: four distinct rows repeated five times (20 rows), in the same order
# as before. Built in a single call because DataFrame.append was removed in
# pandas 2.0 and growing a frame row by row is quadratic.
toy_rows = [
    {'a': 1, 'b': 2, 'c': 20.0},
    {'a': 1, 'b': 2, 'c': 500.0},
    {'a': 3, 'b': 4, 'c': 450.0},
    {'a': 3, 'b': 4, 'c': 300.0},
]
test = pd.DataFrame(toy_rows * 5, columns=['a', 'b', 'c'])
test

In [None]:
# one synthetic case per row of the toy frame, column 'b' treated as categorical
get_synth_cases(
    test,
    target='c',
    o=100,
    k=3,
    categorical_col=['b'],
)

In [None]:
# run the full SmoteR resampling on the toy frame
SmoteR(
    test,
    target='c',
    th=0.999,
    o=200,
    u=300,
    k=3,
    categorical_col=['b'],
)