In [1]:
import pandas as pd
import numpy as np

In [3]:
# Load the raw training data; 'train.csv' is a relative path, so it is resolved
# against the notebook's current working directory.
pre_df = pd.read_csv('train.csv')

# Preview the first rows to sanity-check what was loaded.
pre_df.head()

Unnamed: 0,id,belongs_to_collection,budget,genres,homepage,imdb_id,original_language,original_title,overview,popularity,...,release_date,runtime,spoken_languages,status,tagline,title,Keywords,cast,crew,revenue
0,1,"[{'id': 313576, 'name': 'Hot Tub Time Machine ...",14000000,"[{'id': 35, 'name': 'Comedy'}]",,tt2637294,en,Hot Tub Time Machine 2,"When Lou, who has become the ""father of the In...",6.575393,...,2/20/15,93.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The Laws of Space and Time are About to be Vio...,Hot Tub Time Machine 2,"[{'id': 4379, 'name': 'time travel'}, {'id': 9...","[{'cast_id': 4, 'character': 'Lou', 'credit_id...","[{'credit_id': '59ac067c92514107af02c8c8', 'de...",12314651
1,2,"[{'id': 107674, 'name': 'The Princess Diaries ...",40000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,tt0368933,en,The Princess Diaries 2: Royal Engagement,Mia Thermopolis is now a college graduate and ...,8.248895,...,8/6/04,113.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,It can take a lifetime to find true love; she'...,The Princess Diaries 2: Royal Engagement,"[{'id': 2505, 'name': 'coronation'}, {'id': 42...","[{'cast_id': 1, 'character': 'Mia Thermopolis'...","[{'credit_id': '52fe43fe9251416c7502563d', 'de...",95149435
2,3,,3300000,"[{'id': 18, 'name': 'Drama'}]",http://sonyclassics.com/whiplash/,tt2582802,en,Whiplash,"Under the direction of a ruthless instructor, ...",64.29999,...,10/10/14,105.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The road to greatness can take you to the edge.,Whiplash,"[{'id': 1416, 'name': 'jazz'}, {'id': 1523, 'n...","[{'cast_id': 5, 'character': 'Andrew Neimann',...","[{'credit_id': '54d5356ec3a3683ba0000039', 'de...",13092000
3,4,,1200000,"[{'id': 53, 'name': 'Thriller'}, {'id': 18, 'n...",http://kahaanithefilm.com/,tt1821480,hi,Kahaani,Vidya Bagchi (Vidya Balan) arrives in Kolkata ...,3.174936,...,3/9/12,122.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,,Kahaani,"[{'id': 10092, 'name': 'mystery'}, {'id': 1054...","[{'cast_id': 1, 'character': 'Vidya Bagchi', '...","[{'credit_id': '52fe48779251416c9108d6eb', 'de...",16000000
4,5,,0,"[{'id': 28, 'name': 'Action'}, {'id': 53, 'nam...",,tt1380152,ko,마린보이,Marine Boy is the story of a former national s...,1.14807,...,2/5/09,118.0,"[{'iso_639_1': 'ko', 'name': '한국어/조선말'}]",Released,,Marine Boy,,"[{'cast_id': 3, 'character': 'Chun-soo', 'cred...","[{'credit_id': '52fe464b9251416c75073b43', 'de...",3923970


In [4]:
def remove_zero_budget_rows(dataset):
    """Return only the rows of ``dataset`` whose ``'budget'`` value is not 0.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame containing a ``'budget'`` column.

    Returns
    -------
    pandas.DataFrame
        The matching rows with their original index labels; ``dataset`` itself
        is not modified.
    """
    # Boolean mask: True wherever the budget differs from zero.
    has_budget = dataset['budget'].ne(0)
    return dataset.loc[has_budget]

# Build the working frame from the rows with a non-zero budget; the filter
# returns a new frame, so `pre_df` keeps every original row.
df = remove_zero_budget_rows(pre_df)

In [7]:
# Step 2: Preprocess the data
# Keep the single predictor ('budget') together with the target ('revenue'),
# and discard any row in which either of the two values is missing.
selected_features = ['budget', 'revenue']
df = df.loc[:, selected_features].dropna()

# Step 3: Split the dataset into training and testing sets
# Ordered (unshuffled) 80/20 split: the leading 80% of rows are used for
# training and the remaining rows for testing.
train_size = int(len(df) * 0.8)
train_df = df.iloc[:train_size]
test_df = df.iloc[train_size:]

# Step 4: Define the decision tree regression model (it is instantiated and trained further below)
class DecisionTreeRegressor:
    """Minimal regression tree that splits on MSE reduction.

    The fitted tree is stored in ``self.tree``. Internal nodes are dicts with
    the keys ``'feature'``, ``'threshold'``, ``'left'`` and ``'right'``; leaves
    are plain ``float`` values holding the mean target of the rows that reached
    them.

    Parameters
    ----------
    max_depth : int or None, default None
        Depth at which growth stops and a leaf is created. ``None`` lets the
        tree grow until no split reduces the MSE any further.
    """

    def __init__(self, max_depth=None):
        self.max_depth = max_depth
        self.tree = None

    def fit(self, X, y):
        """Build the tree from training data.

        Parameters
        ----------
        X : pandas.DataFrame
            Feature columns, one row per sample.
        y : pandas.Series
            Target values, matched to ``X`` by position.

        Raises
        ------
        ValueError
            If ``X`` and ``y`` differ in length or are empty.
        """
        if len(X) != len(y):
            raise ValueError(
                f"X and y must have the same length, got {len(X)} and {len(y)}"
            )
        if len(X) == 0:
            raise ValueError("Cannot fit a tree on empty training data")

        # Work on re-indexed copies: the boolean masks used while splitting
        # need X and y to share an index, but the caller's objects must not
        # be modified behind their back.
        X = X.reset_index(drop=True)
        y = y.reset_index(drop=True)
        self.tree = self._build_tree(X, y, depth=0)

    def _build_tree(self, X, y, depth):
        """Recursively grow the subtree for the samples in ``X`` / ``y``.

        Returns a ``float`` for a leaf or a dict describing a split node.
        """
        # ">=" (rather than "==") also stops growth for negative or
        # non-integer max_depth values instead of recursing without limit.
        depth_exhausted = self.max_depth is not None and depth >= self.max_depth
        if depth_exhausted or len(set(y)) == 1:
            # Leaf: stored as a built-in float so that prediction can tell
            # leaves from split nodes regardless of the target's dtype.
            return float(np.mean(y))

        # Find the best split
        feature, threshold, left_mask, right_mask = self._find_best_split(X, y)

        if left_mask is None or right_mask is None:
            # No split improves on the parent: make this node a leaf.
            return float(np.mean(y))

        # Recursively build the left and right subtrees
        left_subtree = self._build_tree(X[left_mask], y[left_mask], depth + 1)
        right_subtree = self._build_tree(X[right_mask], y[right_mask], depth + 1)

        # Create a node representing the split
        return {'feature': feature, 'threshold': threshold, 'left': left_subtree, 'right': right_subtree}

    def _find_best_split(self, X, y):
        """Search every feature/threshold pair for the largest MSE reduction.

        Returns
        -------
        tuple
            ``(feature, threshold, left_mask, right_mask)`` where the masks are
            boolean Series selecting ``X[feature] <= threshold`` and its
            complement, or ``(None, None, None, None)`` when no candidate
            yields a strictly positive reduction.
        """
        best_feature, best_threshold, best_reduction = None, None, -np.inf

        for feature in X.columns:
            # Every distinct observed value is a candidate threshold.
            for threshold in np.unique(X[feature]):
                left_mask = X[feature] <= threshold
                y_left, y_right = y[left_mask], y[~left_mask]

                # A split that leaves one side empty is not a split.
                if len(y_left) == 0 or len(y_right) == 0:
                    continue

                reduction = self._calculate_reduction(y, y_left, y_right)
                if reduction > best_reduction:
                    best_feature, best_threshold, best_reduction = feature, threshold, reduction

        if best_reduction > 0:
            left_mask = X[best_feature] <= best_threshold
            return best_feature, best_threshold, left_mask, ~left_mask
        return None, None, None, None

    def _calculate_reduction(self, parent, left_child, right_child):
        """Return parent MSE minus the size-weighted MSE of the two children."""
        mse_parent = np.mean((parent - np.mean(parent))**2)
        mse_left = np.mean((left_child - np.mean(left_child))**2)
        mse_right = np.mean((right_child - np.mean(right_child))**2)

        n_parent = len(parent)
        weighted_child_mse = (
            len(left_child) / n_parent * mse_left
            + len(right_child) / n_parent * mse_right
        )
        return mse_parent - weighted_child_mse

    def predict(self, X):
        """Predict a target value for every row of ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain the feature columns used during ``fit``.

        Returns
        -------
        numpy.ndarray
            One prediction per row, in row order.

        Raises
        ------
        RuntimeError
            If the model has not been fitted yet.
        """
        if self.tree is None:
            raise RuntimeError("This DecisionTreeRegressor is not fitted yet; call fit() first")
        return np.array([self._predict_single(x, self.tree) for _, x in X.iterrows()])

    def _predict_single(self, x, node):
        """Walk the tree from ``node`` for one row ``x`` and return the leaf value."""
        if not isinstance(node, dict):
            # Leaf node: anything that is not a split dict is a stored mean.
            return node

        # Internal node: follow the same "<=" rule that was used when splitting.
        if x[node['feature']] <= node['threshold']:
            return self._predict_single(x, node['left'])
        return self._predict_single(x, node['right'])

# Build a depth-limited regression tree (at most 3 levels of splits)
tree_regressor = DecisionTreeRegressor(max_depth=3)

# Fit on the training portion: 'budget' is the only predictor, 'revenue' the target
X_train = train_df[['budget']]
y_train = train_df['revenue']
tree_regressor.fit(X_train, y_train)

# Step 5: Evaluate the model on the testing set
X_test = test_df[['budget']]
y_test = test_df['revenue']
predictions = tree_regressor.predict(X_test)

# Mean absolute error between predicted and actual revenue on the held-out rows
mae = np.abs(predictions - y_test).mean()
print(f'Mean Absolute Error: {mae}')

Mean Absolute Error: 58532017.473787256
