In [23]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
import re

# Read the raw table from the CSV file in the working directory
data = pd.read_csv(filepath_or_buffer='random_forest_data.csv')

# Convert currency values to numerical values
def convert_currency(value):
    """Convert a currency-formatted value to a float.

    Currency symbols, thousands separators, whitespace and any other
    character that is not a digit or a decimal point are discarded.
    The sign is preserved: a minus sign that appears before the first
    digit (``"-$1,234"``, ``"$-1,234"``) or accounting-style surrounding
    parentheses (``"(1,234)"``) yield a negative result.

    Args:
        value: A string such as ``"$1,234.56"``, or a value that is
            already numeric (for example a column pandas parsed as
            int/float), which is passed through ``float()`` unchanged.

    Returns:
        The numeric amount as a ``float``.

    Raises:
        ValueError: If a string contains no digits, or the remaining
            characters do not form a valid number (e.g. ``"1.2.3"``).
    """
    # Cells that are already numeric (including NaN) cannot go through
    # re.sub, which only accepts strings; convert them directly.
    if not isinstance(value, str):
        return float(value)

    text = value.strip()
    # Detect the sign before stripping, because the cleanup regex below
    # removes '-' and parentheses along with every other non-numeric char.
    is_negative = (
        re.match(r'[^\d.]*[-\u2212]', text) is not None
        or (text.startswith('(') and text.endswith(')'))
    )

    digits = re.sub(r'[^\d.]', '', text)  # Remove non-numeric characters
    if not digits:
        raise ValueError(f"no numeric content in currency value: {value!r}")

    amount = float(digits)
    return -amount if is_negative else amount

# Parse each monetary column in place, turning its currency strings into floats
currency_columns = (
    'Productivity and Business Processes',
    'Intelligent Cloud',
    'More Personal Computing',
    'Revenue',
)
for column_name in currency_columns:
    data[column_name] = data[column_name].apply(convert_currency)

# Model inputs are the three segment columns; the value to predict is 'Revenue'
target = data['Revenue']
features = data[['Productivity and Business Processes', 'Intelligent Cloud', 'More Personal Computing']]

# Hold out the last 20% of rows for testing. Rows are taken in file order
# (no shuffling): the first `train_size` rows train the model, the rest test it.
train_size = int(0.8 * len(data))
train_features, test_features = features.iloc[:train_size], features.iloc[train_size:]
train_target, test_target = target.iloc[:train_size], target.iloc[train_size:]

# Build a 100-tree random forest with a fixed seed and fit it on the
# training rows (fit() hands back the fitted estimator itself)
model = RandomForestRegressor(n_estimators=100, random_state=42).fit(
    train_features,
    train_target,
)

# Revenue estimates for the held-out rows
predictions = model.predict(test_features)


# Rank the input columns by the fitted forest's importance scores, highest first
feature_importance = model.feature_importances_
feature_importance_df = pd.DataFrame(
    {'Feature': features.columns, 'Importance': feature_importance}
).sort_values(by='Importance', ascending=False)
print("Feature Importance:")
print(feature_importance_df)

# The top-ranked row is reported as the "best" segment.
# NOTE(review): this ranks how much the model relied on each column, which is
# not necessarily the commercially best-performing segment; confirm intent.
best_segment = feature_importance_df['Feature'].iloc[0]
print("Best Segment:", best_segment)

Feature Importance:
                               Feature  Importance
0  Productivity and Business Processes    0.499098
1                    Intelligent Cloud    0.428116
2              More Personal Computing    0.072786
Best Segment: Productivity and Business Processes
