## Importing the essential libraries over here

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

## Importing the dataset over here

In [2]:
data=pd.read_csv('market leader companies with high revenue.csv')

In [3]:
data.head()

Unnamed: 0,Rank,Company,Stock Symbol,Revenue TTM:(USD) in Billion,share price (USD),Company Origin
0,1,Walmart,WMT,648000000000.0,60.62,United States
1,2,Amazon,AMZN,591000000000.0,188.76,United States
2,3,Saudi Aramco,2222.SR,495000000000.0,7.99221,Saudi Arabia
3,4,Sinopec,600028.SS,474000000000.0,0.882565,China
4,5,Berkshire Hathaway,BRK-B,439000000000.0,406.14,United States


In [5]:
data.shape

(8395, 6)

## Taking care of duplicate observations if present over here

In [4]:
data.duplicated().sum()

0

## Taking care of missing values if present over here

In [7]:
data.isnull().sum()

Rank                            0
Company                         0
Stock Symbol                    1
Revenue TTM:(USD) in Billion    0
share price (USD)               0
Company Origin                  2
dtype: int64

In [8]:
missing_values=[feature for feature in data.columns if data[feature].isnull().sum()>1]
for feature in missing_values:
  print(feature)

Company Origin


In [9]:
data.dropna(inplace=True)

## Filtering all the numerical features over here

In [10]:
numerical_features=[feature for feature in data.columns if data[feature].dtype!='O']
for feature in numerical_features:
  print(feature)

Rank
Revenue TTM:(USD) in Billion
share price (USD)


In [11]:
data[numerical_features]

Unnamed: 0,Rank,Revenue TTM:(USD) in Billion,share price (USD)
0,1,6.480000e+11,60.620000
1,2,5.910000e+11,188.760000
2,3,4.950000e+11,7.992210
3,4,4.740000e+11,0.882565
4,5,4.390000e+11,406.140000
...,...,...,...
8390,8391,-1.308743e+09,16.350000
8391,8392,-1.957542e+09,242.711000
8392,8393,-4.930551e+09,1.397320
8393,8394,-1.056529e+10,2.531930


## Filtering all the categorical features over here

In [12]:
cat_features=[feature for feature in data.columns if data[feature].dtype=='O']
for feature in cat_features:
  print(feature)

Company
Stock Symbol
Company Origin


In [14]:
data[cat_features]

Unnamed: 0,Company,Stock Symbol,Company Origin
0,Walmart,WMT,United States
1,Amazon,AMZN,United States
2,Saudi Aramco,2222.SR,Saudi Arabia
3,Sinopec,600028.SS,China
4,Berkshire Hathaway,BRK-B,United States
...,...,...,...
8390,Bright Health,BHG,United States
8391,Sofina,SOF.VI,Belgium
8392,Quilter,QLT.L,United Kingdom
8393,M&G plc,MNG.L,United Kingdom


## Encoding the categorical features into numerical features for machine understanding over here

In [15]:
for feature in cat_features:
  feature_mapping={category:index for index,category in enumerate(data[feature].unique())}
  data[feature]=data[feature].map(feature_mapping)

In [16]:
data

Unnamed: 0,Rank,Company,Stock Symbol,Revenue TTM:(USD) in Billion,share price (USD),Company Origin
0,1,0,0,6.480000e+11,60.620000,0
1,2,1,1,5.910000e+11,188.760000,0
2,3,2,2,4.950000e+11,7.992210,1
3,4,3,3,4.740000e+11,0.882565,2
4,5,4,4,4.390000e+11,406.140000,0
...,...,...,...,...,...,...
8390,8391,8384,8387,-1.308743e+09,16.350000,0
8391,8392,8385,8388,-1.957542e+09,242.711000,23
8392,8393,8386,8389,-4.930551e+09,1.397320,6
8393,8394,8387,8390,-1.056529e+10,2.531930,6


In [17]:
data['RevenueTTM:(USD) in Billion']=data['Revenue TTM:(USD) in Billion']

In [18]:
data.drop("Revenue TTM:(USD) in Billion",axis=1,inplace=True)

## Creating the features and labels over here

In [20]:
X=data.iloc[:,:-1].values
y=data.iloc[:,-1].values

## Splittting the dataset into training set and testing set to avoid the problem of overfitting over here

In [22]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=0)

## Training the model on the training set over here

In [25]:
from sklearn.ensemble import RandomForestRegressor
regressor=RandomForestRegressor(n_estimators=10,random_state=0)
regressor.fit(X_train,y_train)

## Evaluating the performance of the model on the testing set over here

In [27]:
y_pred=regressor.predict(X_test)
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred),1),y_test.reshape(len(y_test),1)),1))

[[3.16e+09 3.16e+09]
 [2.18e+09 2.18e+09]
 [1.11e+08 1.11e+08]
 ...
 [9.21e+07 9.19e+07]
 [2.06e+09 2.06e+09]
 [1.49e+09 1.48e+09]]


## Checking the metrics over here

In [28]:
from sklearn.metrics import r2_score
r2_score(y_test,y_pred)

0.9980543958143514